import re import datetime from sqlalchemy.ext.declarative import declarative_base from sqlalchemy import Column from sqlalchemy import Integer from sqlalchemy import String from sqlalchemy import Boolean from sqlalchemy import DateTime from encodingUtils import EncodingUtils engine_url = "sqlite:///twiderboard.db" Base = declarative_base() class Member(Base): """ Represents an entry in a leaderboard. An entry in the leaderboard is fully represented by the name of the poster, the number of tweets he posted and the corresponding hashtag Some more information can be stored in the db, such as the last update or the creation date """ __tablename__ = "member" id = Column(Integer, primary_key=True) author = Column(String) # name of the guy that tweeted hashtag = Column(String) # name of the hashtag of the tweet created = Column(DateTime) # date of creation of the member updated = Column(DateTime) # date of last count update count = Column(Integer) # Number of tweets for this couple author/hashtag def __init__(self, author, hashtag, count=0): self.author = author self.hashtag = hashtag self.created = datetime.datetime.now() self.updated = datetime.datetime.now() self.count = count def increment(self): """ Increments the count value """ self.count += 1 def update(self): self.increment() self.updated = datetime.datetime.now() def has_author(self): """ Returns True if author is not empty or null """ return (len(self.author) != 0 and self.author is not None) def has_hashtag(self): """ Returns True if hashtag is not empty or null """ return (len(self.hashtag) != 0 and self.hashtag is not None) def __repr__(self): return "<%s('%s' on'%s' last '%s') count: %s>" % (self.author, self.hashtag, self.created, self.updated, self.count) class Tweet(Base): """ Class that fully represents a tweet as it is stored in the database. It is different from the structure that can be found in tweepy """ __tablename__ = "tweets" id = Column(Integer, primary_key=True) hashtag = Column(String) # Hashtag that is tracked text = Column(String) # Content of the tweet author = Column(String) # name of the tweeter created = Column(String) # FIXME: Change to date. Date at which message was tweeted inserted = Column(DateTime) # Date at which tweet was saved in db crawled = Column(Boolean) # Boolean whether or not tweet is in statistics already source = Column(String) # Where tweet comes from # Boolean that is set to True if Tweet cannot be processed correctly invalid = Column(Boolean) def __init__(self, author, created, inserted, crawled, source, text): self.eu = EncodingUtils() # used to switch to unicode self.author = self.eu.to_unicode(author) self.created = self.eu.to_unicode(created) self.crawled = crawled self.inserted = inserted self.source = self.eu.to_unicode(source) self.hashtag = '' self.text = self.eu.to_unicode(text) self.hashtags = self.extract_hashtags() self.invalid = False # cannot be invalid by default def extract_hashtags(self): """ Extracts all the hashtags that are present in the tweet FIXME: Problem here is that we lose lots of tags because they end/start with special characters! """ return set(part[:] for part in self.text.split() if part.startswith('#')) #return re.findall(r"#(\w+)", self.text) def get_main_tag(self, trendy): """ Given a list of tracked hashtag, defines the most important one """ in_hashs = [i.lower() for i in self.hashtags] trend_hashs = [i.lower() for i in trendy] match = [i for i in in_hashs if i in trend_hashs] if len(match) != 0: self.hashtag = self.eu.to_unicode(match[0]) def has_author(self): """ Returns True if author is not empty or null """ return (len(self.author) != 0 and self.author is not None) def has_hashtag(self): """ Returns True if hashtag is not empty or null """ return (len(self.hashtag) != 0 and self.hashtag is not None) def __repr__(self): try: return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), self.text.encode('utf-8')) except UnicodeDecodeError: return "Contains Unicode!!"