mirror of
https://github.com/jlengrand/twitterboard.git
synced 2026-03-10 00:41:18 +00:00
Adds a new column to tweets in db : invalid
Will be used to mark problematic tweets as crawled so that they can be corrected later on. This avoids having to process them over and over. I might have to think about a better way to perform database updates if I do this often.
This commit is contained in:
0
db_changes/__init__.py
Normal file
0
db_changes/__init__.py
Normal file
17
db_changes/v01.py
Normal file
17
db_changes/v01.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
Changes made between v00 and v01
|
||||
|
||||
Operation is :
|
||||
- Add new column of type boolean and name invalid in Tweet table
|
||||
"""
|
||||
import sqlite3
|
||||
|
||||
# Absolute path of the SQLite database file upgraded by this migration.
db_name = '/home/jll/Documents/code/twitterboard/twiderboard.db'

# The migration runs at module level, exactly like the original script.
con = sqlite3.connect(db_name)
try:
    c = con.cursor()
    try:
        # PRAGMA table_info yields one row per column of the table;
        # index 1 of each row is the column name.
        existing = [row[1] for row in c.execute("PRAGMA table_info(tweets)")]
        if 'invalid' in existing:
            # Running the migration twice must not fail with
            # "duplicate column name".
            print("database already at v01")
        else:
            c.execute("ALTER TABLE tweets ADD COLUMN 'invalid' BOOLEAN")
            con.commit()
            # Parenthesised single-argument print works on Python 2 and 3.
            print("database updated to v01")
    finally:
        c.close()
finally:
    # Closing the cursor alone leaves the connection (and its file handle)
    # open; release it even when the ALTER statement fails.
    con.close()
|
||||
BIN
twiderboard.db
BIN
twiderboard.db
Binary file not shown.
@@ -129,10 +129,23 @@ class Counter():
|
||||
self.logger.error("ElementException : More than one member found !")
|
||||
raise ElementException # FIXME : Take care
|
||||
|
||||
self.flush(session)
|
||||
except ElementException:
|
||||
self.invalidate(Tweet)
|
||||
self.logger.error("ElementException : Could not process %s !" % (tweet))
|
||||
|
||||
self.flush(session)
|
||||
|
||||
def invalidate(self, tweet, session=None):
    """
    Invalidates a tweet so that it is not recrawled by the counter
    and can be verified later.

    tweet: the tweet object to flag (an instance, not the Tweet class);
        its ``invalid`` and ``crawled`` attributes are both set to True.
    session: optional database session. When provided, the tweet is added
        to it. The parameter is optional so that existing one-argument
        calls keep working.
    """
    tweet.invalid = True
    tweet.crawled = True

    # The previous body referenced a name ``session`` that was neither a
    # parameter nor a local, which raises NameError unless a module-level
    # global of that name exists.
    if session is not None:
        session.add(tweet)
    # NOTE(review): without a session the flags are only set on the object;
    # presumably the tweet is already attached to the session that loaded
    # it and will be persisted on the next flush -- confirm against caller.

    self.cpt += 1  # indicates that we have a candidate for the flushing
|
||||
|
||||
def update(self, session, member, tweet):
|
||||
"""
|
||||
Updates member values.
|
||||
|
||||
@@ -17,5 +17,7 @@ root = '/home/jll/Documents/code/twitterboard/'
|
||||
# TODO: do that correctly
|
||||
|
||||
engine_url = 'sqlite:///twiderboard.db'
|
||||
|
||||
|
||||
log_name = 'board.log'
|
||||
log_path = os.path.join(root, log_name)
|
||||
|
||||
@@ -78,6 +78,9 @@ class Tweet(Base):
|
||||
crawled = Column(Boolean) # Boolean whether or not tweet is in statistics already
|
||||
source = Column(String) # Where tweet comes from
|
||||
|
||||
# Boolean that is set to True if Tweet cannot be processed correctly
|
||||
invalid = Column(Boolean)
|
||||
|
||||
def __init__(self, author, created, inserted, crawled, source, text):
|
||||
self.eu = EncodingUtils() # used to switch to unicode
|
||||
|
||||
@@ -91,7 +94,8 @@ class Tweet(Base):
|
||||
|
||||
self.hashtags = self.extract_hashtags()
|
||||
|
||||
# BETTER, but see how it works
|
||||
self.invalid = False # cannot be invalid by default
|
||||
|
||||
def extract_hashtags(self):
|
||||
"""
|
||||
Extracts all the hashtags that are present in the tweet
|
||||
@@ -127,4 +131,4 @@ class Tweet(Base):
|
||||
try:
|
||||
return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), self.text.encode('utf-8'))
|
||||
except UnicodeDecodeError:
|
||||
return "Contains Unicode!!"
|
||||
return "Contains Unicode!!"
|
||||
|
||||
Reference in New Issue
Block a user