Adds a new column to the tweets table in the db: invalid

It will be used to mark problematic tweets as crawled so that they can be corrected later on. This avoids having to process them over and over.

I might have to think about a better way to perform database updates if I do it often.
This commit is contained in:
Julien Lengrand-Lambert
2013-01-10 10:35:41 +01:00
parent 5e6be14555
commit 01e128f494
6 changed files with 39 additions and 3 deletions

0
db_changes/__init__.py Normal file
View File

17
db_changes/v01.py Normal file
View File

@@ -0,0 +1,17 @@
"""
Changes made between v00 and v01
Operation is :
- Add new column of type boolean and name invalid in Tweet table

The script runs the migration as soon as it is executed. It is not
idempotent: running it a second time makes SQLite reject the duplicate
column, and the resulting exception propagates.
"""
import sqlite3

# Absolute path of the database file that is migrated.
db_name = '/home/jll/Documents/code/twitterboard/twiderboard.db'

con = sqlite3.connect(db_name)
try:
    c = con.cursor()
    try:
        c.execute("ALTER TABLE tweets ADD COLUMN 'invalid' BOOLEAN")
        con.commit()
    finally:
        # Release the cursor even if the ALTER TABLE statement fails.
        c.close()
finally:
    # Always release the database file handle, on success or on error.
    con.close()

# Only reached when the migration succeeded. The parenthesised form is valid
# under both Python 2 and Python 3.
print("database updated to v01")

Binary file not shown.

View File

@@ -129,10 +129,23 @@ class Counter():
self.logger.error("ElementException : More than one member found !")
raise ElementException # FIXME : Take care
self.flush(session)
except ElementException:
self.invalidate(Tweet)
self.logger.error("ElementException : Could not process %s !" % (tweet))
self.flush(session)
def invalidate(self, tweet, session=None):
    """
    Invalidates a tweet so that it is not recrawled by the counter
    and can be verified later

    tweet   -- the tweet instance to flag; its ``invalid`` and ``crawled``
               attributes are both set to True.
    session -- optional database session. When given, the tweet is added
               to it so that the change is part of the next flush.
               When omitted the tweet is only flagged.
               NOTE(review): presumably the tweet is then already attached
               to the caller's session -- confirm.

    Raises TypeError if a class is passed instead of an instance.
    """
    # Setting attributes on the class itself would overwrite its column
    # definitions instead of flagging a single row, so refuse it outright.
    if isinstance(tweet, type):
        raise TypeError("invalidate() expects a tweet instance, got the class %r" % (tweet,))

    tweet.invalid = True
    tweet.crawled = True

    # The session used to be read from an undefined name; it is now an
    # explicit, optional argument.
    if session is not None:
        session.add(tweet)

    self.cpt += 1  # indicates that we have a candidate for the flushing
def update(self, session, member, tweet):
"""
Updates member values.

View File

@@ -17,5 +17,7 @@ root = '/home/jll/Documents/code/twitterboard/'
# TODO: do that correctly
# Database URL of the application.
# NOTE(review): this looks like a relative SQLite URL, i.e. resolved against
# the current working directory rather than against `root` -- confirm.
engine_url = 'sqlite:///twiderboard.db'
# Name of the log file, and its full path inside `root`.
log_name = 'board.log'
log_path = os.path.join(root, log_name)

View File

@@ -78,6 +78,9 @@ class Tweet(Base):
crawled = Column(Boolean) # Boolean whether or not tweet is in statistics already
source = Column(String) # Where tweet comes from
# Boolean that is set to True if Tweet cannot be processed correctly
# NOTE(review): the v01 migration adds this column without a DEFAULT, so rows
# stored before the migration presumably hold NULL rather than False -- confirm.
invalid = Column(Boolean)
def __init__(self, author, created, inserted, crawled, source, text):
self.eu = EncodingUtils() # used to switch to unicode
@@ -91,7 +94,8 @@ class Tweet(Base):
self.hashtags = self.extract_hashtags()
# BETTER, but see how it works
self.invalid = False # cannot be invalid by default
def extract_hashtags(self):
"""
Extracts all the hashtags that are present in the tweet
@@ -127,4 +131,4 @@ class Tweet(Base):
try:
return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), self.text.encode('utf-8'))
except UnicodeDecodeError:
return "Contains Unicode!!"
return "Contains Unicode!!"