From 01e128f4949862046a5404e06938bd460ce72b3c Mon Sep 17 00:00:00 2001 From: Julien Lengrand-Lambert Date: Thu, 10 Jan 2013 10:35:41 +0100 Subject: [PATCH] Adds a new column to tweets in db : invalid Will be used to set problematic tweets as crawled and try to correct them later on. This avoids having to process them over and over. I might have to think about a better way to perform database updates if I do it often. --- db_changes/__init__.py | 0 db_changes/v01.py | 17 +++++++++++++++++ twiderboard.db | Bin 149504 -> 149504 bytes twiderboard/counter.py | 15 ++++++++++++++- twiderboard/data.py | 2 ++ twiderboard/datamodel.py | 8 ++++++-- 6 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 db_changes/__init__.py create mode 100644 db_changes/v01.py diff --git a/db_changes/__init__.py b/db_changes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/db_changes/v01.py b/db_changes/v01.py new file mode 100644 index 0000000..d6134d6 --- /dev/null +++ b/db_changes/v01.py @@ -0,0 +1,17 @@ +""" +Changes made between v00 and v01 + +Operation is : +- Add new column of type boolean and name invalid in Tweet table +""" +import sqlite3 + +db_name = '/home/jll/Documents/code/twitterboard/twiderboard.db' + +con = sqlite3.connect(db_name) +c = con.cursor() +c.execute("ALTER TABLE tweets ADD COLUMN 'invalid' BOOLEAN") +con.commit() +c.close() + +print "database updated to v01" \ No newline at end of file diff --git a/twiderboard.db b/twiderboard.db index d3f1039c18280c21aa0784f9e772e544ebebfc37..987c098d50facdc6f8a649461403026bbd7f4b19 100644 GIT binary patch delta 114 zcmZpez}YZ?bAq&}0|NuYBp`+XW*~!UqJh4k1B31(RbFOR1|~*Z21Z*Z@5vWg&T=(z zGqQ_IN-{PVPUdGmAgG?1SC*KQnWC=XgYH5EVJ=sZ>C)=Kx2Lbb$oAPX=7nr I#=_(R00ZF~ivR!s delta 92 zcmZpez}YZ?bAq&}Jp%*7Bp`+XCLn`xqJh4kJ%jEeRbC*EiOHLRnU%?VyP^xzWX6dN m9+SP9c5yXUFtUqFN-{QkPj+WKFje}RlM4X$#S$<8 diff --git a/twiderboard/counter.py b/twiderboard/counter.py index 1f452d5..c125131 100644 --- a/twiderboard/counter.py +++ 
b/twiderboard/counter.py @@ -129,10 +129,23 @@ class Counter(): self.logger.error("ElementException : More than one member found !") raise ElementException # FIXME : Take care - self.flush(session) except ElementException: + self.invalidate(session, tweet) self.logger.error("ElementException : Could not process %s !" % (tweet)) + self.flush(session) + + def invalidate(self, session, tweet): + """ + Marks the given tweet as invalid and crawled in the given session, so + that the counter does not recrawl it and it can be verified later + """ + tweet.invalid = True + tweet.crawled = True + session.add(tweet) + + self.cpt += 1 # indicates that we have a candidate for the flushing + def update(self, session, member, tweet): """ Updates member values. diff --git a/twiderboard/data.py b/twiderboard/data.py index fb6a21d..8f627a7 100644 --- a/twiderboard/data.py +++ b/twiderboard/data.py @@ -17,5 +17,7 @@ root = '/home/jll/Documents/code/twitterboard/' # TODO: do that correctly engine_url = 'sqlite:///twiderboard.db' + + log_name = 'board.log' log_path = os.path.join(root, log_name) diff --git a/twiderboard/datamodel.py b/twiderboard/datamodel.py index a91dfed..2cee006 100644 --- a/twiderboard/datamodel.py +++ b/twiderboard/datamodel.py @@ -78,6 +78,9 @@ class Tweet(Base): crawled = Column(Boolean) # Boolean whether or not tweet is in statistics already source = Column(String) # Where tweet comes from + # Boolean that is set to True if Tweet cannot be processed correctly + invalid = Column(Boolean) + def __init__(self, author, created, inserted, crawled, source, text): self.eu = EncodingUtils() # used to switch to unicode @@ -91,7 +94,8 @@ class Tweet(Base): self.hashtags = self.extract_hashtags() - # BETTER, but see how it works + self.invalid = False # cannot be invalid by default + def extract_hashtags(self): """ Extracts all the hashtags that are present in the tweet @@ -127,4 +131,4 @@ class Tweet(Base): try: return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), 
self.text.encode('utf-8')) except UnicodeDecodeError: - return "Contains Unicode!!" \ No newline at end of file + return "Contains Unicode!!"