Skip to content

Commit 73c451d

Browse files
committed
Make the database settings configurable.
Also, switch to `with` statements for cleanup and improve error handling.
1 parent 3a13384 commit 73c451d

File tree

2 files changed

+193
-129
lines changed

2 files changed

+193
-129
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,16 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
1818
- Execute in the current folder (in parallel, if desired):
1919
- `python load_into_pg.py Badges`
2020
- `python load_into_pg.py Posts`
21-
- `python load_into_pg.py Tags`
21+
- `python load_into_pg.py Tags` (only present in later dumps)
2222
- `python load_into_pg.py Users`
2323
- `python load_into_pg.py Votes`
2424
- Finally, after all the initial tables have been created:
2525
- `psql stackoverflow < ./sql/final_post.sql`
2626

2727
## Caveats and TODOs
2828

29-
- It prepares some indexes which may not be necessary for your analysis.
29+
- It prepares some indexes and views which may not be necessary for your analysis.
3030
- The `body` field in the `Posts` table is NOT populated.
31+
- The `emailhash` field in the `Users` table is NOT populated.
3132
- Some tables (e.g. `PostHistory` and `Comments`) are missing.
32-
- The database settings are not configurable.
3333

load_into_pg.py

Lines changed: 190 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
#!/usr/bin/env python
22
import sys
33
import time
4+
import argparse
45
import psycopg2 as pg
56
import row_processor as Processor
67

78
def show_help():
    """Print a one-line usage summary, naming the script as it was invoked."""
    usage = "Usage: {0} <Users|Badges|Posts|Tags|Votes> [XML data-file]".format(sys.argv[0])
    print(usage)
910

1011
def _makeDefValues(keys):
1112
"""Returns a dictionary containing None for all keys."""
@@ -26,133 +27,196 @@ def _createCmdTuple(cursor, keys, templ, attribs):
2627
defs.update(attribs)
2728
return cursor.mogrify(templ, defs)
2829

29-
def handleTable(table, keys):
30+
def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
    """Load one XML dump into its table, running the pre/post SQL around it.

    Reads ./sql/<table>_pre.sql and ./sql/<table>_post.sql, connects to the
    database, executes the pre-processing SQL, inserts the rows parsed from
    the XML file, and finally executes the post-processing SQL. Each step is
    committed as soon as it has been executed.

    Arguments:
    table      -- table name; also selects the SQL files and the default
                  XML file name.
    keys       -- attribute names used to build the values of every row.
    dbname     -- name of the database to connect to.
    mbDbFile   -- path of the XML file, or None to use '<table>.xml'.
    mbHost     -- database host, or None to omit the setting.
    mbPort     -- database port, or None to omit the setting.
    mbUsername -- database user, or None to omit the setting.
    mbPassword -- database password, or None to omit the setting.

    Calls sys.exit(-1) when the pre/post SQL files cannot be read. I/O and
    database errors raised afterwards are reported on stderr and the
    function then returns normally.
    """
    dbFile = mbDbFile if mbDbFile is not None else table + '.xml'
    tmpl = _createMogrificationTemplate(keys)
    start_time = time.time()

    try:
        # `with` closes the SQL files again instead of leaking the handles.
        with open('./sql/' + table + '_pre.sql') as sqlFile:
            pre = sqlFile.read()
        with open('./sql/' + table + '_post.sql') as sqlFile:
            post = sqlFile.read()
    except IOError:
        sys.stderr.write("Could not load pre/post sql. Are you running from the correct path?\n")
        sys.exit(-1)

    # Pass the settings as keyword arguments rather than gluing them into a
    # connection string: the driver then takes care of quoting values that
    # contain spaces or quotes. Note that the keyword for the login name is
    # `user`; `username` is rejected as an unknown connection option.
    connParams = {'dbname': dbname}
    optionalParams = (
        ('host', mbHost),
        ('port', mbPort),
        ('user', mbUsername),
        ('password', mbPassword),
    )
    connParams.update((name, value) for name, value in optionalParams if value is not None)

    try:
        conn = pg.connect(**connParams)
        try:
            with conn.cursor() as cur:
                try:
                    # The data file is opened before anything is executed, so
                    # a missing file is reported before the pre-processing
                    # SQL has touched the database.
                    with open(dbFile) as xml:
                        # Pre-processing (dropping/creation of tables)
                        print('Pre-processing ...')
                        if pre != '':
                            cur.execute(pre)
                            conn.commit()
                        print('Pre-processing took {} seconds'.format(time.time() - start_time))

                        # Handle content of the table
                        start_time = time.time()
                        print('Processing data ...')
                        for rows in Processor.batch(Processor.parse(xml), 500):
                            valuesStr = ',\n'.join(
                                [_createCmdTuple(cur, keys, tmpl, row_attribs)
                                 for row_attribs in rows]
                            )

                            if len(valuesStr) > 0:
                                cmd = 'INSERT INTO ' + table + \
                                      ' VALUES\n' + valuesStr + ';'
                                cur.execute(cmd)
                                conn.commit()
                        print('Table processing took {} seconds'.format(time.time() - start_time))

                        # Post-processing (creation of indexes)
                        start_time = time.time()
                        print('Post processing ...')
                        if post != '':
                            cur.execute(post)
                            conn.commit()
                        print('Post processing took {} seconds'.format(time.time() - start_time))

                except IOError as e:
                    sys.stderr.write("Could not read from file {}.\n".format(dbFile))
                    sys.stderr.write("IOError ({0}): {1}\n".format(e.errno, e.strerror))
        finally:
            # Using the connection as a context manager would only end the
            # transaction; close it explicitly so it is always released.
            conn.close()
    except pg.Error as e:
        # pgerror is None when the error did not come from the backend
        # (e.g. a failed connection attempt); fall back to the exception text.
        message = e.pgerror if e.pgerror is not None else e
        sys.stderr.write("Error in dealing with the database.\n")
        sys.stderr.write("pg.Error ({0}): {1}\n".format(e.pgcode, message))
    except pg.Warning as w:
        sys.stderr.write("Warning from the database.\n")
        sys.stderr.write("pg.Warning ({0}): \n".format(str(w)))
105+
106+
107+
108+
#############################################################
109+
110+
# Command line: the table is mandatory, every database setting is optional.
parser = argparse.ArgumentParser()
parser.add_argument('table',
                    help='The table to work on.',
                    choices=['Users', 'Badges', 'Posts', 'Tags', 'Votes'])
parser.add_argument('-d', '--dbname',
                    help='Name of database to create the table in. The database must exist.',
                    default='stackoverflow')
parser.add_argument('-f', '--file',
                    help='Name of the file to extract data from.',
                    default=None)
parser.add_argument('-u', '--username',
                    help='Username for the database.',
                    default=None)
parser.add_argument('-p', '--password',
                    help='Password for the database.',
                    default=None)
parser.add_argument('-P', '--port',
                    help='Port to connect with the database on.',
                    default=None)
parser.add_argument('-H', '--host',
                    help='Hostname for the database.',
                    default=None)

args = parser.parse_args()

table = args.table

# Attribute names handed to handleTable for each supported table; an
# unknown table name yields None.
keys = {
    'Users': [
        'Id',
        'Reputation',
        'CreationDate',
        'DisplayName',
        'LastAccessDate',
        'WebsiteUrl',
        'Location',
        'AboutMe',
        'Views',
        'UpVotes',
        'DownVotes',
        'ProfileImageUrl',
        'Age',
        'AccountId',
    ],
    'Badges': [
        'Id',
        'UserId',
        'Name',
        'Date',
    ],
    'Votes': [
        'Id',
        'PostId',
        'VoteTypeId',
        'UserId',
        'CreationDate',
        'BountyAmount',
    ],
    'Posts': [
        'Id',
        'PostTypeId',
        'AcceptedAnswerId',
        'ParentId',
        'CreationDate',
        'Score',
        'ViewCount',
        # 'Body' is deliberately not part of the list.
        'OwnerUserId',
        'LastEditorUserId',
        'LastEditorDisplayName',
        'LastEditDate',
        'LastActivityDate',
        'Title',
        'Tags',
        'AnswerCount',
        'CommentCount',
        'FavoriteCount',
        'ClosedDate',
        'CommunityOwnedDate',
    ],
    'Tags': [
        'Id',
        'TagName',
        'Count',
        'ExcerptPostId',
        'WikiPostId',
    ],
}.get(table)

# Ask for confirmation first; only an answer starting with 'y' or 'Y'
# goes ahead.
choice = raw_input('This will drop the {} table. Are you sure [y/n]?'.format(table))

if choice[:1].lower() == 'y':
    handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password)
else:
    print("Cancelled.")
158222

0 commit comments

Comments
 (0)