From 01e128f4949862046a5404e06938bd460ce72b3c Mon Sep 17 00:00:00 2001
From: Julien Lengrand-Lambert <julien@lengrand.fr>
Date: Thu, 10 Jan 2013 10:35:41 +0100
Subject: [PATCH] Adds a new column to tweets in db : invalid

Will be used to mark problematic tweets as crawled so that they can be corrected later on. This avoids having to process them over and over.

I might have to think about a better way to perform database updates if I do it often.
---
 db_changes/__init__.py   |   0
 db_changes/v01.py        |  17 +++++++++++++++++
 twiderboard.db           | Bin 149504 -> 149504 bytes
 twiderboard/counter.py   |  15 ++++++++++++++-
 twiderboard/data.py      |   2 ++
 twiderboard/datamodel.py |   8 ++++++--
 6 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 db_changes/__init__.py
 create mode 100644 db_changes/v01.py

diff --git a/db_changes/__init__.py b/db_changes/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/db_changes/v01.py b/db_changes/v01.py
new file mode 100644
index 0000000..d6134d6
--- /dev/null
+++ b/db_changes/v01.py
@@ -0,0 +1,17 @@
+"""
+Database schema changes made between v00 and v01.
+
+Operation is :
+- Add a new column named invalid, of type boolean, to the tweets table
+"""
+import sqlite3
+
+db_name = '/home/jll/Documents/code/twitterboard/twiderboard.db'  # hard-coded absolute path; adjust before running elsewhere
+
+con = sqlite3.connect(db_name)
+c = con.cursor()
+c.execute("ALTER TABLE tweets ADD COLUMN 'invalid' BOOLEAN")  # no DEFAULT given, so existing rows hold NULL in the new column
+con.commit()
+c.close()  # NOTE(review): closes the cursor only; the connection itself is never closed explicitly
+
+print "database updated to v01"
\ No newline at end of file
diff --git a/twiderboard.db b/twiderboard.db
index d3f1039c18280c21aa0784f9e772e544ebebfc37..987c098d50facdc6f8a649461403026bbd7f4b19 100644
GIT binary patch
delta 114
zcmZpez}YZ?bAq&}0|NuYBp`+XW*~!UqJh4k1B31(RbFOR1|~*Z21Z*Z@5vWg&T=(z
zGqQ_IN-{PVPUdGmAgG?1SC*KQnWC=X<nQm}>gYH5EVJ=sZ>C)=Kx2Lbb$oAPX=7nr
I#=_(R00ZF~ivR!s

delta 92
zcmZpez}YZ?bAq&}Jp%*7Bp`+XCLn`xqJh4kJ%jEeRbC*EiOHLRnU%?VyP^xzWX6dN
m9+SP9c5yXUFtUqFN-{QkPj+WKFj<Ppu!*IOg>e}RlM4X$#S$<8

diff --git a/twiderboard/counter.py b/twiderboard/counter.py
index 1f452d5..c125131 100644
--- a/twiderboard/counter.py
+++ b/twiderboard/counter.py
@@ -129,10 +129,23 @@ class Counter():
                     self.logger.error("ElementException :  More than one member found !")
                     raise ElementException  # FIXME : Take care
 
-                self.flush(session)
             except ElementException:
+                self.invalidate(Tweet)
                 self.logger.error("ElementException :  Could not process %s !" % (tweet))
 
+            self.flush(session)
+
+    def invalidate(self, tweet):
+        """
+        Marks a tweet as invalid and crawled so that it is not recrawled by the counter and can be verified later.
+        NOTE(review): the call site in this patch passes Tweet (apparently the model class), not the tweet instance - confirm.
+        """
+        tweet.invalid = True
+        tweet.crawled = True
+        session.add(tweet)  # NOTE(review): session is not a parameter of this method - confirm it is in scope here
+
+        self.cpt += 1  # indicates that we have a candidate for the flushing
+
     def update(self, session, member, tweet):
         """
         Updates member values.
diff --git a/twiderboard/data.py b/twiderboard/data.py
index fb6a21d..8f627a7 100644
--- a/twiderboard/data.py
+++ b/twiderboard/data.py
@@ -17,5 +17,7 @@ root = '/home/jll/Documents/code/twitterboard/'
 # TODO: do that correctly
 
 engine_url = 'sqlite:///twiderboard.db'
+
+
 log_name = 'board.log'
 log_path = os.path.join(root, log_name)
diff --git a/twiderboard/datamodel.py b/twiderboard/datamodel.py
index a91dfed..2cee006 100644
--- a/twiderboard/datamodel.py
+++ b/twiderboard/datamodel.py
@@ -78,6 +78,9 @@ class Tweet(Base):
     crawled = Column(Boolean)  # Boolean whether or not tweet is in statistics already
     source = Column(String)  # Where tweet comes from
 
+    # Boolean that is set to True if Tweet cannot be processed correctly
+    invalid = Column(Boolean)
+
     def __init__(self, author, created, inserted, crawled, source, text):
         self.eu = EncodingUtils()  # used to switch to unicode
 
@@ -91,7 +94,8 @@ class Tweet(Base):
 
         self.hashtags = self.extract_hashtags()
 
-    # BETTER, but see how it works
+        self.invalid = False  # cannot be invalid by default
+
     def extract_hashtags(self):
         """
         Extracts all the hashtags that are present in the tweet
@@ -127,4 +131,4 @@ class Tweet(Base):
             try:
                 return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), self.text.encode('utf-8'))
             except UnicodeDecodeError:
-                return "Contains Unicode!!"
\ No newline at end of file
+                return "Contains Unicode!!"