Files
twitterboard/twiderboard/datamodel.py
Julien Lengrand-Lambert 01e128f494 Adds a new column to tweets in db : invalid
Will be used to mark problematic tweets as crawled so that they can be corrected later on. This avoids having to process them over and over.

I might have to think about a better way to perform database updates if I do it often.
2013-01-10 10:35:41 +01:00

135 lines
4.6 KiB
Python

import re
import datetime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column
from sqlalchemy import Integer
from sqlalchemy import String
from sqlalchemy import Boolean
from sqlalchemy import DateTime
from encodingUtils import EncodingUtils
# SQLAlchemy database URL: a SQLite database stored in the file twiderboard.db
engine_url = "sqlite:///twiderboard.db"
# Declarative base class that the model classes of this module inherit from
Base = declarative_base()
class Member(Base):
    """
    Represents an entry in a leaderboard.

    An entry in the leaderboard is fully represented by the name of the
    poster, the number of tweets he posted and the corresponding hashtag.
    Some more information is stored in the db: the creation date and the
    date of the last count update.
    """
    __tablename__ = "member"

    id = Column(Integer, primary_key=True)
    author = Column(String)  # name of the guy that tweeted
    hashtag = Column(String)  # name of the hashtag of the tweet
    created = Column(DateTime)  # date of creation of the member
    updated = Column(DateTime)  # date of last count update
    count = Column(Integer)  # Number of tweets for this couple author/hashtag

    def __init__(self, author, hashtag, count=0):
        """
        Creates a new leaderboard entry.

        author  -- name of the poster
        hashtag -- hashtag the entry counts tweets for
        count   -- initial number of tweets (defaults to 0)
        """
        self.author = author
        self.hashtag = hashtag
        # Use one timestamp so that a brand new entry has created == updated
        now = datetime.datetime.now()
        self.created = now
        self.updated = now
        self.count = count

    def increment(self):
        """
        Increments the count value
        """
        self.count += 1

    def update(self):
        """
        Increments the count value and refreshes the last update date
        """
        self.increment()
        self.updated = datetime.datetime.now()

    def has_author(self):
        """
        Returns True if author is not empty or null
        """
        # Truthiness covers both None and the empty string; calling len()
        # first would raise a TypeError on a null author.
        return bool(self.author)

    def has_hashtag(self):
        """
        Returns True if hashtag is not empty or null
        """
        # Same reasoning as has_author: never call len() on a null value.
        return bool(self.hashtag)

    def __repr__(self):
        """
        Returns a string showing author, hashtag, creation date,
        last update date and count, in that order.
        """
        fields = (self.author, self.hashtag, self.created, self.updated,
                  self.count)
        return "<%s('%s' on'%s' last '%s') count: %s>" % fields
class Tweet(Base):
    """
    Class that fully represents a tweet as it is stored in the database.
    It is different from the structure that can be found in tweepy
    """
    __tablename__ = "tweets"

    id = Column(Integer, primary_key=True)
    hashtag = Column(String)  # Hashtag that is tracked
    text = Column(String)  # Content of the tweet
    author = Column(String)  # name of the tweeter
    # FIXME: Change to date. Date at which message was tweeted
    created = Column(String)
    inserted = Column(DateTime)  # Date at which tweet was saved in db
    # Boolean whether or not tweet is in statistics already
    crawled = Column(Boolean)
    source = Column(String)  # Where tweet comes from
    # Boolean that is set to True if Tweet cannot be processed correctly
    invalid = Column(Boolean)

    def __init__(self, author, created, inserted, crawled, source, text):
        """
        Creates a tweet; author, created, source and text are passed
        through EncodingUtils.to_unicode before being stored.

        The tracked hashtag starts empty (see get_main_tag) and the tweet
        is considered valid by default.
        """
        # NOTE(review): eu and hashtags are plain attributes set only here;
        # objects loaded back by the ORM may not go through __init__ --
        # confirm before calling get_main_tag on queried tweets.
        self.eu = EncodingUtils()  # used to switch to unicode

        self.author = self.eu.to_unicode(author)
        self.created = self.eu.to_unicode(created)
        self.crawled = crawled
        self.inserted = inserted
        self.source = self.eu.to_unicode(source)
        self.hashtag = ''
        self.text = self.eu.to_unicode(text)
        # Needs self.text, so it has to come after the assignment above
        self.hashtags = self.extract_hashtags()
        self.invalid = False  # cannot be invalid by default

    def extract_hashtags(self):
        """
        Extracts all the hashtags that are present in the tweet

        Returns the set of whitespace-separated words of the text that
        start with '#' (the '#' is kept).

        FIXME: Problem here is that we lose lots of tags because they
        end/start with special characters!
        A regular expression such as r"#(\\w+)" was considered instead.
        """
        words = self.text.split()
        return set(word for word in words if word.startswith('#'))

    def get_main_tag(self, trendy):
        """
        Given a list of tracked hashtag, defines the most important one

        The comparison is case-insensitive. The first hashtag of the tweet
        that is tracked is stored, lowercased, in self.hashtag. As the
        hashtags of the tweet are kept in a set, which one comes first is
        not defined when several match. self.hashtag is left untouched
        when nothing matches.
        """
        # A set makes each membership test constant time
        tracked = set(tag.lower() for tag in trendy)
        for tag in self.hashtags:
            lowered = tag.lower()
            if lowered in tracked:
                self.hashtag = self.eu.to_unicode(lowered)
                break

    def has_author(self):
        """
        Returns True if author is not empty or null
        """
        # Truthiness covers both None and the empty string; calling len()
        # first would raise a TypeError on a null author.
        return bool(self.author)

    def has_hashtag(self):
        """
        Returns True if hashtag is not empty or null
        """
        # Same reasoning as has_author: never call len() on a null value.
        return bool(self.hashtag)

    def __repr__(self):
        """
        Returns a string showing author, creation date, tracked hashtag
        and text, or a fixed message if they cannot be encoded.
        """
        # NOTE(review): encoding each field to utf-8 before formatting
        # looks written for Python 2 strings -- confirm before porting.
        try:
            return "<%s('%s','%s', '%s')>" % (
                self.author.encode('utf-8'),
                self.created.encode('utf-8'),
                self.hashtag.encode('utf-8'),
                self.text.encode('utf-8'))
        except UnicodeDecodeError:
            return "Contains Unicode!!"