From db298e685891330fbdec6b93102c98d1a4a43523 Mon Sep 17 00:00:00 2001 From: madtibo Date: Thu, 16 Aug 2018 12:30:17 +0200 Subject: [PATCH 1/9] download and load a complete stackexchange project using the '-s' switch, download the compressed file from _https://ia800107.us.archive.org/27/items/stackexchange/_, then, uncompress it and load all the files in the database. Add a '-n' switch to move the tables to a given schema WARNING: since using the urllib.request module, set the script to use python3 --- README.md | 36 ++++++--- load_into_pg.py | 181 ++++++++++++++++++++++++++++++++++++++-------- sql/Votes_pre.sql | 2 +- 3 files changed, 178 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 26ccbf4..6b021c5 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,14 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch `Badges.xml`, `Votes.xml`, `Posts.xml`, `Users.xml`, `Tags.xml`. - In some old dumps, the cases in the filenames are different. - Execute in the current folder (in parallel, if desired): - - `python load_into_pg.py Badges` - - `python load_into_pg.py Posts` - - `python load_into_pg.py Tags` (not present in earliest dumps) - - `python load_into_pg.py Users` - - `python load_into_pg.py Votes` - - `python load_into_pg.py PostLinks` - - `python load_into_pg.py PostHistory` - - `python load_into_pg.py Comments` + - `python load_into_pg.py -t Badges` + - `python load_into_pg.py -t Posts` + - `python load_into_pg.py -t Tags` (not present in earliest dumps) + - `python load_into_pg.py -t Users` + - `python load_into_pg.py -t Votes` + - `python load_into_pg.py -t PostLinks` + - `python load_into_pg.py -t PostHistory` + - `python load_into_pg.py -t Comments` - Finally, after all the initial tables have been created: - `psql stackoverflow < ./sql/final_post.sql` - If you used a different database name, make sure to use that instead of @@ -34,7 +34,25 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch - `psql stackoverflow < ./sql/optional_post.sql` - Again, remember to user the correct database name here, if not `stackoverflow`. -## Caveats +## Loading a complete stackexchange project + +You can use the script to download a given stackexchange compressed file from +[archive.org](https://ia800107.us.archive.org/27/items/stackexchange/) and load +all the tables at once, using the `-s` switch. + +You will need the `urllib` and `libarchive` modules. + +If you give a schema name using the `-n` switch, all the tables will be moved +to the given schema. This schema will be created in the script. + +To load the _dba.stackexchange.com_ project in the `dba` schema, you would execute: +`./load_into_pg.py -s dba -n dba` + +The paths are not changed in the final scripts `sql/final_post.sql` and +`sql/optional_post.sql`. To run them, first set the _search_path_ to your +schema name: `SET search_path TO ;` + +## Caveats and TODOs - It prepares some indexes and views which may not be necessary for your analysis. - The `Body` field in `Posts` table is NOT populated by default. You have to use `--with-post-body` argument to include it. diff --git a/load_into_pg.py b/load_into_pg.py index 66b651d..ba28cf3 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -1,8 +1,9 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import time import argparse import psycopg2 as pg +import os import row_processor as Processor import six import json @@ -12,6 +13,51 @@ ('Posts', 'ViewCount'): "NULLIF(%(ViewCount)s, '')::int" } +# part of the file already downloaded +file_part = None + +def show_progress(block_num, block_size, total_size): + """Display the total size of the file to download and the progess in percent""" + global file_part + if file_part is None: + suffixes=['B','KB','MB','GB','TB'] + suffixIndex = 0 + pp_size = total_size + while pp_size > 1024: + suffixIndex += 1 #increment the index of the suffix + pp_size = pp_size/1024.0 #apply the division + six.print_('Total file size is: {0:.1f} {1}'.format(pp_size,suffixes[suffixIndex])) + six.print_("0 % of the file downloaded ...\r", end="", flush=True) + file_part = 0 + + downloaded = block_num * block_size + if downloaded < total_size: + percent = 100 * downloaded / total_size + if percent - file_part > 1: + file_part = percent + six.print_("{0} % of the file downloaded ...\r".format(int(percent)), end="", flush=True) + else: + file_part = None + six.print_("") + +def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword): + dbConnectionParam = "dbname={}".format(dbname) + + if mbPort is not None: + dbConnectionParam += ' port={}'.format(mbPort) + + if mbHost is not None: + dbConnectionParam += ' host={}'.format(mbHost) + + # TODO Is the escaping done here correct? + if mbUsername is not None: + dbConnectionParam += ' user={}'.format(mbUsername) + + # TODO Is the escaping done here correct? + if mbPassword is not None: + dbConnectionParam += ' password={}'.format(mbPassword) + return dbConnectionParam + def _makeDefValues(keys): """Returns a dictionary containing None for all keys.""" return dict(( (k, None) for k in keys )) @@ -150,7 +196,7 @@ def _getTableKeys(table): ] return keys -def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): +def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam): """Handle the table including the post/pre processing.""" keys = _getTableKeys(table) dbFile = mbDbFile if mbDbFile is not None else table + '.xml' @@ -165,23 +211,6 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m six.print_("Could not load pre/post/fk sql. Are you running from the correct path?", file=sys.stderr) sys.exit(-1) - dbConnectionParam = "dbname={}".format(dbname) - - if mbPort is not None: - dbConnectionParam += ' port={}'.format(mbPort) - - if mbHost is not None: - dbConnectionParam += ' host={}'.format(mbHost) - - # TODO Is the escaping done here correct? - if mbUsername is not None: - dbConnectionParam += ' user={}'.format(mbUsername) - - # TODO Is the escaping done here correct? - if mbPassword is not None: - dbConnectionParam += ' password={}'.format(mbPassword) - - try: with pg.connect(dbConnectionParam) as conn: with conn.cursor() as cur: @@ -208,7 +237,7 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m ' VALUES\n' + valuesStr + ';' cur.execute(cmd) conn.commit() - six.print_('Table {0} processing took {1:.1f} seconds'.format(table, time.time() - start_time)) + six.print_('Table \'{0}\' processing took {1:.1f} seconds'.format(table, time.time() - start_time)) # Post-processing (creation of indexes) start_time = time.time() @@ -237,12 +266,32 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m six.print_("Warning from the database.", file=sys.stderr) six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr) + +def moveTableToSchema(table, schemaName, dbConnectionParam): + try: + with pg.connect(dbConnectionParam) as conn: + with conn.cursor() as cur: + # create the schema + cur.execute('CREATE SCHEMA IF NOT EXISTS '+schemaName+';') + conn.commit() + # move the table to the right schema + cur.execute('ALTER TABLE '+table+' SET SCHEMA '+schemaName+';') + conn.commit() + except pg.Error as e: + six.print_("Error in dealing with the database.", file=sys.stderr) + six.print_("pg.Error ({0}): {1}".format(e.pgcode, e.pgerror), file=sys.stderr) + six.print_(str(e), file=sys.stderr) + except pg.Warning as w: + six.print_("Warning from the database.", file=sys.stderr) + six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr) + ############################################################# parser = argparse.ArgumentParser() -parser.add_argument( 'table' +parser.add_argument( '-t', '--table' , help = 'The table to work on.' , choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments'] + , default = None ) parser.add_argument( '-d', '--dbname' @@ -255,6 +304,22 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m , default = None ) +parser.add_argument( '-s', '--so-project' + , help = 'stackexchange project to load.' + , default = None + ) + +parser.add_argument( '--archive-url' + , help = 'URL of the archive directory to retrieve.' + , default = 'https://ia800107.us.archive.org/27/items/stackexchange' + ) + +parser.add_argument( '-k', '--keep-archive' + , help = 'should we keep the downloaded archive.' + , action = 'store_true' + , default = False + ) + parser.add_argument( '-u', '--username' , help = 'Username for the database.' , default = None @@ -287,6 +352,11 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m , default = False ) +parser.add_argument( '-n', '--schema-name' + , help = 'Use specific schema.' + , default = 'public' + ) + parser.add_argument( '--foreign-keys' , help = 'Create foreign keys.' , action = 'store_true' @@ -295,22 +365,71 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m args = parser.parse_args() -table = args.table - try: # Python 2/3 compatibility input = raw_input except NameError: pass +dbConnectionParam = buildConnectionString(args.dbname, args.host, args.port, args.username, args.password) + +# load given file in table +if args.file and args.table: + table = args.table + + if table == 'Posts': + # If the user has not explicitly asked for loading the body, we replace it with NULL + if not args.with_post_body: + specialRules[('Posts', 'Body')] = 'NULL' + + choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table)) + if len(choice) > 0 and choice[0].lower() == 'y': + handleTable(table, args.insert_json, args.foreign_keys, args.file, dbConnectionParam) + else: + six.print_("Cancelled.") + if args.schema_name != 'public': + moveTableToSchema(table, args.schema_name, dbConnectionParam) + exit(0) + +# load a project +elif args.so_project: + import urllib.request + import libarchive + + # download the 7z archive in /tmp + file_name = args.so_project + '.stackexchange.com.7z' + url = '{0}/{1}'.format(args.archive_url, file_name) + filepath = '/tmp/'+file_name + six.print_('Downloading the archive, please be patient ...') + try: + urllib.request.urlretrieve(url, filepath, show_progress) + except Exception as e: + six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e)) + exit(1) + + try: + libarchive.extract_file(filepath) + except Exception as e: + six.print_('Error: impossible to extract the {0} archive ({1})'.format(url, e)) + exit(1) + + tables = [ 'Tags', 'Users', 'Badges', 'Posts', 'Comments', 'Votes', 'PostLinks', 'PostHistory' ] + + for table in tables: + six.print_('Load {0}.xml file'.format(table)) + handleTable(table, args.insert_json, args.foreign_keys, args.file, dbConnectionParam) + # remove file + os.remove(table+'.xml') + + if not args.keep_archive: + # remove archive + os.remove(filepath) -if table == 'Posts': - # If the user has not explicitly asked for loading the body, we replace it with NULL - if not args.with_post_body: - specialRules[('Posts', 'Body')] = 'NULL' + if args.schema_name != 'public': + for table in tables: + moveTableToSchema(table, args.schema_name, dbConnectionParam) + exit(0) -choice = input('This will drop the {} table. Are you sure [y/n]? '.format(table)) -if len(choice) > 0 and choice[0].lower() == 'y': - handleTable(table, args.insert_json, args.foreign_keys, args.dbname, args.file, args.host, args.port, args.username, args.password) else: - six.print_("Cancelled.") + six.print_("Error: you must either use '-f' and '-t' arguments or the '-s' argument.") + parser.print_help() diff --git a/sql/Votes_pre.sql b/sql/Votes_pre.sql index 29aebe0..3ed0b53 100644 --- a/sql/Votes_pre.sql +++ b/sql/Votes_pre.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS Votes CASCADE; CREATE TABLE Votes ( Id int PRIMARY KEY , - PostId int not NULL , + PostId int , -- not NULL , VoteTypeId int not NULL , UserId int , CreationDate timestamp not NULL , From fc4dc261bbb3501ffd2c1cfdffb5e66438bc58d3 Mon Sep 17 00:00:00 2001 From: madtibo Date: Tue, 12 Feb 2019 18:08:26 +0100 Subject: [PATCH 2/9] make it possible to keep downloaded archive --- load_into_pg.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index ba28cf3..45a6a13 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys import time import argparse @@ -245,7 +245,7 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam): if post != '': cur.execute(post) conn.commit() - six.print_('Post processing took {} seconds'.format(time.time() - start_time)) + six.print_('Post processing took {0:.1f} seconds'.format(time.time() - start_time)) if createFk: # fk-processing (creation of foreign keys) start_time = time.time() @@ -253,7 +253,7 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam): if post != '': cur.execute(fk) conn.commit() - six.print_('fk processing took {} seconds'.format(time.time() - start_time)) + six.print_('fk processing took {0:.1f} seconds'.format(time.time() - start_time)) except IOError as e: six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr) @@ -393,19 +393,22 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): # load a project elif args.so_project: - import urllib.request import libarchive - # download the 7z archive in /tmp - file_name = args.so_project + '.stackexchange.com.7z' - url = '{0}/{1}'.format(args.archive_url, file_name) - filepath = '/tmp/'+file_name - six.print_('Downloading the archive, please be patient ...') - try: - urllib.request.urlretrieve(url, filepath, show_progress) - except Exception as e: - six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e)) - exit(1) + filepath = None + if args.file: + filepath = args.file + else: + # download the 7z archive in /tmp + file_name = args.so_project + '.stackexchange.com.7z' + url = '{0}/{1}'.format(args.archive_url, file_name) + filepath = '/tmp/'+file_name + six.print_('Downloading the archive, please be patient ...') + try: + six.moves.urllib.request.urlretrieve(url, filepath, show_progress) + except Exception as e: + six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e)) + exit(1) try: libarchive.extract_file(filepath) @@ -417,7 +420,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): for table in tables: six.print_('Load {0}.xml file'.format(table)) - handleTable(table, args.insert_json, args.foreign_keys, args.file, dbConnectionParam) + handleTable(table, args.insert_json, args.foreign_keys, None, dbConnectionParam) # remove file os.remove(table+'.xml') From 2b39ef6e65caad859eb3c4f7cd698e098de2b309 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay <502876+musically-ut@users.noreply.github.com> Date: Thu, 11 Apr 2019 18:12:08 +0200 Subject: [PATCH 3/9] Minor spacing fixes. --- load_into_pg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index 45a6a13..ca8e34a 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -272,10 +272,10 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): with pg.connect(dbConnectionParam) as conn: with conn.cursor() as cur: # create the schema - cur.execute('CREATE SCHEMA IF NOT EXISTS '+schemaName+';') + cur.execute('CREATE SCHEMA IF NOT EXISTS ' + schemaName + ';') conn.commit() # move the table to the right schema - cur.execute('ALTER TABLE '+table+' SET SCHEMA '+schemaName+';') + cur.execute('ALTER TABLE '+table+' SET SCHEMA ' + schemaName + ';') conn.commit() except pg.Error as e: six.print_("Error in dealing with the database.", file=sys.stderr) @@ -305,7 +305,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): ) parser.add_argument( '-s', '--so-project' - , help = 'stackexchange project to load.' + , help = 'StackExchange project to load.' , default = None ) @@ -315,8 +315,8 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): ) parser.add_argument( '-k', '--keep-archive' - , help = 'should we keep the downloaded archive.' - , action = 'store_true' + , help = 'Will preserve the downloaded archive instead of deleting it.' + , action = 'store_true' , default = False ) From 22b0de5ef1bfe07792d857c8af17865a8cb84355 Mon Sep 17 00:00:00 2001 From: madtibo Date: Wed, 17 Apr 2019 12:09:31 +0200 Subject: [PATCH 4/9] store downloaded file in temporary dir using python tempfile library --- README.md | 2 +- load_into_pg.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6b021c5..d81dd89 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch ## Dependencies - [`lxml`](http://lxml.de/installation.html) - - [`psychopg2`](http://initd.org/psycopg/docs/install.html) + - [`psycopg2`](http://initd.org/psycopg/docs/install.html) ## Usage diff --git a/load_into_pg.py b/load_into_pg.py index ca8e34a..21e8dee 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -394,16 +394,23 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): # load a project elif args.so_project: import libarchive + import tempfile filepath = None if args.file: filepath = args.file + url = filepath else: - # download the 7z archive in /tmp + # download the 7z archive in tempdir file_name = args.so_project + '.stackexchange.com.7z' url = '{0}/{1}'.format(args.archive_url, file_name) - filepath = '/tmp/'+file_name - six.print_('Downloading the archive, please be patient ...') + temp_dir = tempfile.gettempdir() + if temp_dir == 'None': + six.print_('WARNING: Could not find temporary directory. Use current directory instead.') + temp_dir = os.getcwd() + filepath = os.path.join(temp_dir, file_name) + six.print_('Downloading the archive in {0}'.format(filepath)) + six.print_('please be patient ...') try: six.moves.urllib.request.urlretrieve(url, filepath, show_progress) except Exception as e: From d9e17353014dba890d4ebfb4da90485e89d491d3 Mon Sep 17 00:00:00 2001 From: madtibo Date: Fri, 26 Apr 2019 16:36:54 +0200 Subject: [PATCH 5/9] store in temp directory and remove it if needed --- load_into_pg.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index 21e8dee..3fedb7a 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -397,6 +397,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): import tempfile filepath = None + temp_dir = None if args.file: filepath = args.file url = filepath @@ -404,10 +405,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): # download the 7z archive in tempdir file_name = args.so_project + '.stackexchange.com.7z' url = '{0}/{1}'.format(args.archive_url, file_name) - temp_dir = tempfile.gettempdir() - if temp_dir == 'None': - six.print_('WARNING: Could not find temporary directory. Use current directory instead.') - temp_dir = os.getcwd() + temp_dir = tempfile.mkdtemp(prefix='so_') filepath = os.path.join(temp_dir, file_name) six.print_('Downloading the archive in {0}'.format(filepath)) six.print_('please be patient ...') @@ -432,8 +430,12 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): os.remove(table+'.xml') if not args.keep_archive: - # remove archive os.remove(filepath) + if temp_dir: + # remove the archive and the temporary directory + os.rmdir(temp_dir) + else: + six.print_("Archive '{0}' deleted".format(filepath)) if args.schema_name != 'public': for table in tables: From 588b74041fb3318b2f97c81803400d2c9ce14d91 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 2 May 2019 00:53:13 +0200 Subject: [PATCH 6/9] Adds libarchive as a dependency. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d81dd89..24146e2 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch - [`lxml`](http://lxml.de/installation.html) - [`psycopg2`](http://initd.org/psycopg/docs/install.html) + - [`libarchive`](https://www.libarchive.org/) ## Usage From 7bede10d79750bb2cfeb52a6048fb0aad8bd97ec Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 2 May 2019 15:27:04 +0200 Subject: [PATCH 7/9] Addresses the correct libarchive library. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 24146e2..36e9b2b 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch - [`lxml`](http://lxml.de/installation.html) - [`psycopg2`](http://initd.org/psycopg/docs/install.html) - - [`libarchive`](https://www.libarchive.org/) + - [`libarchive-c`](https://pypi.org/project/libarchive-c/) ## Usage From dc79fdab7e20aff873f13425c0467ea52cc3068e Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 2 May 2019 19:38:20 +0200 Subject: [PATCH 8/9] Fixes spacing issues. --- load_into_pg.py | 103 ++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index 3fedb7a..c1b53aa 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + import sys import time import argparse @@ -16,17 +17,19 @@ # part of the file already downloaded file_part = None + def show_progress(block_num, block_size, total_size): - """Display the total size of the file to download and the progess in percent""" + """Display the total size of the file to download and the progress in percent""" global file_part if file_part is None: - suffixes=['B','KB','MB','GB','TB'] + suffixes = ['B', 'KB', 'MB', 'GB', 'TB'] suffixIndex = 0 pp_size = total_size while pp_size > 1024: - suffixIndex += 1 #increment the index of the suffix - pp_size = pp_size/1024.0 #apply the division - six.print_('Total file size is: {0:.1f} {1}'.format(pp_size,suffixes[suffixIndex])) + suffixIndex += 1 # Increment the index of the suffix + pp_size = pp_size / 1024.0 # Apply the division + six.print_('Total file size is: {0:.1f} {1}' + .format(pp_size, suffixes[suffixIndex])) six.print_("0 % of the file downloaded ...\r", end="", flush=True) file_part = 0 @@ -40,6 +43,7 @@ def show_progress(block_num, block_size, total_size): file_part = None six.print_("") + def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword): dbConnectionParam = "dbname={}".format(dbname) @@ -58,20 +62,23 @@ def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword): dbConnectionParam += ' password={}'.format(mbPassword) return dbConnectionParam + def _makeDefValues(keys): """Returns a dictionary containing None for all keys.""" - return dict(( (k, None) for k in keys )) + return dict(((k, None) for k in keys)) + def _createMogrificationTemplate(table, keys, insertJson): """Return the template string for mogrification for the given keys.""" - table_keys = ', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules - else specialRules[table, k] - for k in keys ]) + table_keys = ', '.join(['%(' + k + ')s' if (table, k) not in specialRules + else specialRules[table, k] + for k in keys]) if insertJson: return ('(' + table_keys + ', %(jsonfield)s' + ')') else: return ('(' + table_keys + ')') + def _createCmdTuple(cursor, keys, templ, attribs, insertJson): """Use the cursor to mogrify a tuple of data. The passed data in `attribs` is augmented with default data (NULLs) and the @@ -83,14 +90,14 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson): defs.update(attribs) if insertJson: - dict_attribs = { } + dict_attribs = {} for name, value in attribs.items(): dict_attribs[name] = value defs['jsonfield'] = json.dumps(dict_attribs) - values_to_insert = cursor.mogrify(templ, defs) return cursor.mogrify(templ, defs) + def _getTableKeys(table): """Return an array of the keys for a given table""" keys = None @@ -177,25 +184,26 @@ def _getTableKeys(table): ] elif table == 'PostHistory': keys = [ - 'Id', - 'PostHistoryTypeId', - 'PostId', - 'RevisionGUID', - 'CreationDate', - 'UserId', - 'Text' + 'Id' + , 'PostHistoryTypeId' + , 'PostId' + , 'RevisionGUID' + , 'CreationDate' + , 'UserId' + , 'Text' ] elif table == 'Comments': keys = [ - 'Id', - 'PostId', - 'Score', - 'Text', - 'CreationDate', - 'UserId', + 'Id' + , 'PostId' + , 'Score' + , 'Text' + , 'CreationDate' + , 'UserId' ] return keys + def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam): """Handle the table including the post/pre processing.""" keys = _getTableKeys(table) @@ -228,10 +236,10 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam): six.print_('Processing data ...') for rows in Processor.batch(Processor.parse(xml), 500): valuesStr = ',\n'.join( - [ _createCmdTuple(cur, keys, tmpl, row_attribs, insertJson).decode('utf-8') - for row_attribs in rows - ] - ) + [_createCmdTuple(cur, keys, tmpl, row_attribs, insertJson).decode('utf-8') + for row_attribs in rows + ] + ) if len(valuesStr) > 0: cmd = 'INSERT INTO ' + table + \ ' VALUES\n' + valuesStr + ';' @@ -249,11 +257,11 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam): if createFk: # fk-processing (creation of foreign keys) start_time = time.time() - six.print_('fk processing ...') + six.print_('Foreign Key processing ...') if post != '': cur.execute(fk) conn.commit() - six.print_('fk processing took {0:.1f} seconds'.format(time.time() - start_time)) + six.print_('Foreign Key processing took {0:.1f} seconds'.format(time.time() - start_time)) except IOError as e: six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr) @@ -275,7 +283,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): cur.execute('CREATE SCHEMA IF NOT EXISTS ' + schemaName + ';') conn.commit() # move the table to the right schema - cur.execute('ALTER TABLE '+table+' SET SCHEMA ' + schemaName + ';') + cur.execute('ALTER TABLE ' + table + ' SET SCHEMA ' + schemaName + ';') conn.commit() except pg.Error as e: six.print_("Error in dealing with the database.", file=sys.stderr) @@ -288,76 +296,76 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): ############################################################# parser = argparse.ArgumentParser() -parser.add_argument( '-t', '--table' +parser.add_argument('-t', '--table' , help = 'The table to work on.' , choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments'] , default = None ) -parser.add_argument( '-d', '--dbname' +parser.add_argument('-d', '--dbname' , help = 'Name of database to create the table in. The database must exist.' , default = 'stackoverflow' ) -parser.add_argument( '-f', '--file' +parser.add_argument('-f', '--file' , help = 'Name of the file to extract data from.' , default = None ) -parser.add_argument( '-s', '--so-project' +parser.add_argument('-s', '--so-project' , help = 'StackExchange project to load.' , default = None ) -parser.add_argument( '--archive-url' +parser.add_argument('--archive-url' , help = 'URL of the archive directory to retrieve.' , default = 'https://ia800107.us.archive.org/27/items/stackexchange' ) -parser.add_argument( '-k', '--keep-archive' +parser.add_argument('-k', '--keep-archive' , help = 'Will preserve the downloaded archive instead of deleting it.' , action = 'store_true' , default = False ) -parser.add_argument( '-u', '--username' +parser.add_argument('-u', '--username' , help = 'Username for the database.' , default = None ) -parser.add_argument( '-p', '--password' +parser.add_argument('-p', '--password' , help = 'Password for the database.' , default = None ) -parser.add_argument( '-P', '--port' +parser.add_argument('-P', '--port' , help = 'Port to connect with the database on.' , default = None ) -parser.add_argument( '-H', '--host' +parser.add_argument('-H', '--host' , help = 'Hostname for the database.' , default = None ) -parser.add_argument( '--with-post-body' +parser.add_argument('--with-post-body' , help = 'Import the posts with the post body. Only used if importing Posts.xml' , action = 'store_true' , default = False ) -parser.add_argument( '-j', '--insert-json' +parser.add_argument('-j', '--insert-json' , help = 'Insert raw data as JSON.' , action = 'store_true' , default = False ) -parser.add_argument( '-n', '--schema-name' +parser.add_argument('-n', '--schema-name' , help = 'Use specific schema.' , default = 'public' ) -parser.add_argument( '--foreign-keys' +parser.add_argument('--foreign-keys' , help = 'Create foreign keys.' , action = 'store_true' , default = False @@ -421,13 +429,14 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): six.print_('Error: impossible to extract the {0} archive ({1})'.format(url, e)) exit(1) - tables = [ 'Tags', 'Users', 'Badges', 'Posts', 'Comments', 'Votes', 'PostLinks', 'PostHistory' ] + tables = ['Tags', 'Users', 'Badges', 'Posts', 'Comments', + 'Votes', 'PostLinks', 'PostHistory'] for table in tables: six.print_('Load {0}.xml file'.format(table)) handleTable(table, args.insert_json, args.foreign_keys, None, dbConnectionParam) # remove file - os.remove(table+'.xml') + os.remove(table + '.xml') if not args.keep_archive: os.remove(filepath) From e0be1b2982d1d11e73cd37ed2536df204cab02a5 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 2 May 2019 19:47:26 +0200 Subject: [PATCH 9/9] Fix minor spacing. --- load_into_pg.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index c1b53aa..d7c3158 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -295,6 +295,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): ############################################################# + parser = argparse.ArgumentParser() parser.add_argument('-t', '--table' , help = 'The table to work on.' @@ -349,14 +350,14 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): ) parser.add_argument('--with-post-body' - , help = 'Import the posts with the post body. Only used if importing Posts.xml' - , action = 'store_true' + , help = 'Import the posts with the post body. Only used if importing Posts.xml' + , action = 'store_true' , default = False ) parser.add_argument('-j', '--insert-json' , help = 'Insert raw data as JSON.' - , action = 'store_true' + , action = 'store_true' , default = False ) @@ -367,7 +368,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam): parser.add_argument('--foreign-keys' , help = 'Create foreign keys.' - , action = 'store_true' + , action = 'store_true' , default = False )