mirror of
https://github.com/jlengrand/twitterboard.git
synced 2026-03-10 08:51:22 +00:00
Adds a new column to the tweets table in the db: invalid
It will be used to mark problematic tweets as crawled so that they can be corrected later on. This avoids having to process them over and over. I might have to think about a better way to perform database updates if I do this often.
This commit is contained in:
0
db_changes/__init__.py
Normal file
0
db_changes/__init__.py
Normal file
17
db_changes/v01.py
Normal file
17
db_changes/v01.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"""
|
||||||
|
Changes made between v00 and v01
|
||||||
|
|
||||||
|
Operation is :
|
||||||
|
- Add new column of type boolean and name invalid in Tweet table
|
||||||
|
"""
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
db_name = '/home/jll/Documents/code/twitterboard/twiderboard.db'
|
||||||
|
|
||||||
|
con = sqlite3.connect(db_name)
|
||||||
|
c = con.cursor()
|
||||||
|
c.execute("ALTER TABLE tweets ADD COLUMN 'invalid' BOOLEAN")
|
||||||
|
con.commit()
|
||||||
|
c.close()
|
||||||
|
|
||||||
|
print "database updated to v01"
|
||||||
BIN
twiderboard.db
BIN
twiderboard.db
Binary file not shown.
@@ -129,10 +129,23 @@ class Counter():
|
|||||||
self.logger.error("ElementException : More than one member found !")
|
self.logger.error("ElementException : More than one member found !")
|
||||||
raise ElementException # FIXME : Take care
|
raise ElementException # FIXME : Take care
|
||||||
|
|
||||||
self.flush(session)
|
|
||||||
except ElementException:
|
except ElementException:
|
||||||
|
self.invalidate(Tweet)
|
||||||
self.logger.error("ElementException : Could not process %s !" % (tweet))
|
self.logger.error("ElementException : Could not process %s !" % (tweet))
|
||||||
|
|
||||||
|
self.flush(session)
|
||||||
|
|
||||||
|
def invalidate(self, tweet, session=None):
    """
    Invalidates a tweet so that it is not recrawled by the counter
    and can be verified later.

    tweet -- the tweet instance (not the Tweet class) that could not be
             processed.
    session -- database session used to register the change. When it is
               omitted, the session the tweet is already attached to
               is used instead.

    Raises ValueError if no session is given and the tweet is not
    attached to any session.
    """
    if session is None:
        # Imported lazily: only this fallback path needs it.
        from sqlalchemy.orm import object_session
        session = object_session(tweet)
        if session is None:
            raise ValueError(
                "invalidate() needs a session: tweet is not attached to one")

    # Flag as crawled too, so the tweet is not picked up again.
    tweet.invalid = True
    tweet.crawled = True
    session.add(tweet)

    self.cpt += 1  # indicates that we have a candidate for the flushing
|
||||||
|
|
||||||
def update(self, session, member, tweet):
|
def update(self, session, member, tweet):
|
||||||
"""
|
"""
|
||||||
Updates member values.
|
Updates member values.
|
||||||
|
|||||||
@@ -17,5 +17,7 @@ root = '/home/jll/Documents/code/twitterboard/'
|
|||||||
# TODO: do that correctly
|
# TODO: do that correctly
|
||||||
|
|
||||||
engine_url = 'sqlite:///twiderboard.db'
|
engine_url = 'sqlite:///twiderboard.db'
|
||||||
|
|
||||||
|
|
||||||
log_name = 'board.log'
|
log_name = 'board.log'
|
||||||
log_path = os.path.join(root, log_name)
|
log_path = os.path.join(root, log_name)
|
||||||
|
|||||||
@@ -78,6 +78,9 @@ class Tweet(Base):
|
|||||||
crawled = Column(Boolean) # Boolean whether or not tweet is in statistics already
|
crawled = Column(Boolean) # Boolean whether or not tweet is in statistics already
|
||||||
source = Column(String) # Where tweet comes from
|
source = Column(String) # Where tweet comes from
|
||||||
|
|
||||||
|
# Boolean that is set to True if Tweet cannot be processed correctly
|
||||||
|
invalid = Column(Boolean)
|
||||||
|
|
||||||
def __init__(self, author, created, inserted, crawled, source, text):
|
def __init__(self, author, created, inserted, crawled, source, text):
|
||||||
self.eu = EncodingUtils() # used to switch to unicode
|
self.eu = EncodingUtils() # used to switch to unicode
|
||||||
|
|
||||||
@@ -91,7 +94,8 @@ class Tweet(Base):
|
|||||||
|
|
||||||
self.hashtags = self.extract_hashtags()
|
self.hashtags = self.extract_hashtags()
|
||||||
|
|
||||||
# BETTER, but see how it works
|
self.invalid = False # cannot be invalid by default
|
||||||
|
|
||||||
def extract_hashtags(self):
|
def extract_hashtags(self):
|
||||||
"""
|
"""
|
||||||
Extracts all the hashtags that are present in the tweet
|
Extracts all the hashtags that are present in the tweet
|
||||||
@@ -127,4 +131,4 @@ class Tweet(Base):
|
|||||||
try:
|
try:
|
||||||
return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), self.text.encode('utf-8'))
|
return "<%s('%s','%s', '%s')>" % (self.author.encode('utf-8'), self.created.encode('utf-8'), self.hashtag.encode('utf-8'), self.text.encode('utf-8'))
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
return "Contains Unicode!!"
|
return "Contains Unicode!!"
|
||||||
|
|||||||
Reference in New Issue
Block a user