11#!/usr/bin/env python
22import sys
33import time
4+ import argparse
45import psycopg2 as pg
56import row_processor as Processor
67
def show_help():
    """Print a one-line usage summary to standard output.

    The line names the running script (sys.argv[0]), the accepted table
    names and the optional XML data file.
    """
    usage = "Usage: {0} <Users|Badges|Posts|Tags|Votes> [XML data-file] ".format(sys.argv[0])
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(usage)
910
1011def _makeDefValues (keys ):
1112 """Returns a dictionary containing None for all keys."""
@@ -26,133 +27,196 @@ def _createCmdTuple(cursor, keys, templ, attribs):
2627 defs .update (attribs )
2728 return cursor .mogrify (templ , defs )
2829
def _connectionArgs(dbname, mbHost, mbPort, mbUsername, mbPassword):
    """Collect the keyword arguments for pg.connect, skipping unset options."""
    options = (
        ('dbname', dbname),
        ('host', mbHost),
        ('port', mbPort),
        # The connection option is called 'user'; 'username' is not accepted.
        ('user', mbUsername),
        ('password', mbPassword),
    )
    return dict((name, value) for name, value in options if value is not None)


def _readSqlFile(path):
    """Return the full text of the SQL script at path, closing the file afterwards."""
    with open(path) as sqlFile:
        return sqlFile.read()


def handleTable(table, keys, dbname='stackoverflow', mbDbFile=None, mbHost=None,
                mbPort=None, mbUsername=None, mbPassword=None):
    """Handle the table including the post/pre processing.

    Runs ./sql/<table>_pre.sql, inserts the rows parsed from the XML file
    in batches, then runs ./sql/<table>_post.sql. Each stage is committed
    and timed separately.

    Arguments:
        table      -- table name; also selects the pre/post SQL scripts and
                      the default XML file name (<table>.xml).
        keys       -- key names handed to _createMogrificationTemplate and
                      _createCmdTuple to build each VALUES tuple.
        dbname     -- database to connect to.
        mbDbFile   -- XML file to read, or None for <table>.xml.
        mbHost, mbPort, mbUsername, mbPassword
                   -- optional connection settings; None leaves the
                      driver default in place.

    Exits the process with status -1 when the pre/post SQL scripts cannot be
    read. I/O and database errors during processing are reported on stderr
    and the function then returns normally.
    """
    dbFile = mbDbFile if mbDbFile is not None else table + '.xml'
    tmpl = _createMogrificationTemplate(keys)
    start_time = time.time()

    try:
        pre = _readSqlFile('./sql/' + table + '_pre.sql')
        post = _readSqlFile('./sql/' + table + '_post.sql')
    except IOError:
        sys.stderr.write("Could not load pre/post sql. Are you running from the correct path?\n")
        sys.exit(-1)

    # Keyword arguments let the driver do the escaping, so values containing
    # spaces or quotes no longer corrupt a hand-built connection string.
    connectionArgs = _connectionArgs(dbname, mbHost, mbPort, mbUsername, mbPassword)

    conn = None
    try:
        conn = pg.connect(**connectionArgs)
        with conn:
            with conn.cursor() as cur:
                try:
                    with open(dbFile) as xml:
                        # Pre-processing (dropping/creation of tables)
                        print('Pre-processing ...')
                        if pre != '':
                            cur.execute(pre)
                            conn.commit()
                        print('Pre-processing took {} seconds'.format(time.time() - start_time))

                        # Handle content of the table
                        start_time = time.time()
                        print('Processing data ...')
                        for rows in Processor.batch(Processor.parse(xml), 500):
                            valuesStr = ',\n '.join(
                                [_createCmdTuple(cur, keys, tmpl, row_attribs)
                                 for row_attribs in rows]
                            )

                            # An empty batch would produce invalid SQL; skip it.
                            if valuesStr:
                                cmd = 'INSERT INTO ' + table + \
                                      ' VALUES\n ' + valuesStr + ';'
                                cur.execute(cmd)
                                conn.commit()
                        print('Table processing took {} seconds'.format(time.time() - start_time))

                        # Post-processing (creation of indexes)
                        start_time = time.time()
                        print('Post processing ...')
                        if post != '':
                            cur.execute(post)
                            conn.commit()
                        print('Post processing took {} seconds'.format(time.time() - start_time))

                except IOError as e:
                    sys.stderr.write("Could not read from file {}.\n".format(dbFile))
                    sys.stderr.write("IOError ({0}): {1}\n".format(e.errno, e.strerror))
    except pg.Error as e:
        # pgerror is unset for failures raised before the server answered
        # (e.g. a refused connection); fall back to the exception text.
        detail = e.pgerror if e.pgerror is not None else str(e)
        sys.stderr.write("Error in dealing with the database.\n")
        sys.stderr.write("pg.Error ({0}): {1}\n".format(e.pgcode, detail))
    except pg.Warning as w:
        sys.stderr.write("Warning from the database.\n")
        sys.stderr.write("pg.Warning ({0}): \n".format(str(w)))
    finally:
        # Leaving the connection's with-block only ends the transaction;
        # the connection itself has to be closed explicitly.
        if conn is not None:
            conn.close()
105+
106+
107+
108+ #############################################################
109+
# Tables this script can load, in the order they are offered on the command
# line, each paired with the keys passed to handleTable for that table.
# Keeping both in one place means every accepted table name has a key list.
tableKeys = [
    ('Users', [
        'Id',
        'Reputation',
        'CreationDate',
        'DisplayName',
        'LastAccessDate',
        'WebsiteUrl',
        'Location',
        'AboutMe',
        'Views',
        'UpVotes',
        'DownVotes',
        'ProfileImageUrl',
        'Age',
        'AccountId',
    ]),
    ('Badges', [
        'Id',
        'UserId',
        'Name',
        'Date',
    ]),
    ('Posts', [
        'Id',
        'PostTypeId',
        'AcceptedAnswerId',
        'ParentId',
        'CreationDate',
        'Score',
        'ViewCount',
        # , 'Body'
        'OwnerUserId',
        'LastEditorUserId',
        'LastEditorDisplayName',
        'LastEditDate',
        'LastActivityDate',
        'Title',
        'Tags',
        'AnswerCount',
        'CommentCount',
        'FavoriteCount',
        'ClosedDate',
        'CommunityOwnedDate',
    ]),
    ('Tags', [
        'Id',
        'TagName',
        'Count',
        'ExcerptPostId',
        'WikiPostId',
    ]),
    ('Votes', [
        'Id',
        'PostId',
        'VoteTypeId',
        'UserId',
        'CreationDate',
        'BountyAmount',
    ]),
]

parser = argparse.ArgumentParser()
parser.add_argument(
    'table',
    help='The table to work on.',
    choices=[tableName for tableName, _ in tableKeys],
)
parser.add_argument(
    '-d', '--dbname',
    help='Name of database to create the table in. The database must exist.',
    default='stackoverflow',
)
parser.add_argument(
    '-f', '--file',
    help='Name of the file to extract data from.',
    default=None,
)
parser.add_argument(
    '-u', '--username',
    help='Username for the database.',
    default=None,
)
parser.add_argument(
    '-p', '--password',
    help='Password for the database.',
    default=None,
)
parser.add_argument(
    '-P', '--port',
    help='Port to connect with the database on.',
    default=None,
)
parser.add_argument(
    '-H', '--host',
    help='Hostname for the database.',
    default=None,
)

args = parser.parse_args()

table = args.table
# argparse has already restricted `table` to the names listed in tableKeys.
keys = dict(tableKeys)[table]

# Loading is destructive, so ask for confirmation first; any answer that
# does not start with 'y' or 'Y' cancels.
choice = raw_input('This will drop the {} table. Are you sure [y/n]?'.format(table))

if len(choice) > 0 and choice[0].lower() == 'y':
    handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password)
else:
    print("Cancelled.")
158222
0 commit comments