From db298e685891330fbdec6b93102c98d1a4a43523 Mon Sep 17 00:00:00 2001
From: madtibo <madtibo_git@tribu-ml.fr>
Date: Thu, 16 Aug 2018 12:30:17 +0200
Subject: [PATCH 1/9] download and load a complete stackexchange project

using the '-s' switch, download the compressed file from _https://ia800107.us.archive.org/27/items/stackexchange/_, then, uncompress it and load all the files in the database. Add a '-n' switch to move the tables to a given schema

WARNING: since using the urllib.request module, set the script to use python3
---
 README.md         |  36 ++++++---
 load_into_pg.py   | 181 ++++++++++++++++++++++++++++++++++++++--------
 sql/Votes_pre.sql |   2 +-
 3 files changed, 178 insertions(+), 41 deletions(-)
diff --git a/README.md b/README.md
index 26ccbf4..6b021c5 100644
--- a/README.md
+++ b/README.md
@@ -18,14 +18,14 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
    `Badges.xml`, `Votes.xml`, `Posts.xml`, `Users.xml`, `Tags.xml`.
    - In some old dumps, the cases in the filenames are different.
  - Execute in the current folder (in parallel, if desired):
-   - `python load_into_pg.py Badges`
-   - `python load_into_pg.py Posts`
-   - `python load_into_pg.py Tags` (not present in earliest dumps)
-   - `python load_into_pg.py Users`
-   - `python load_into_pg.py Votes`
-   - `python load_into_pg.py PostLinks`
-   - `python load_into_pg.py PostHistory`
-   - `python load_into_pg.py Comments`
+   - `python load_into_pg.py -t Badges`
+   - `python load_into_pg.py -t Posts`
+   - `python load_into_pg.py -t Tags` (not present in earliest dumps)
+   - `python load_into_pg.py -t Users`
+   - `python load_into_pg.py -t Votes`
+   - `python load_into_pg.py -t PostLinks`
+   - `python load_into_pg.py -t PostHistory`
+   - `python load_into_pg.py -t Comments`
  - Finally, after all the initial tables have been created:
    - `psql stackoverflow < ./sql/final_post.sql`
    - If you used a different database name, make sure to use that instead of
@@ -34,7 +34,25 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
    - `psql stackoverflow < ./sql/optional_post.sql`
    - Again, remember to user the correct database name here, if not `stackoverflow`.
 
-## Caveats
+## Loading a complete stackexchange project
+
+You can use the script to download a given stackexchange compressed file from
+[archive.org](https://ia800107.us.archive.org/27/items/stackexchange/) and load
+all the tables at once, using the `-s` switch.
+
+You will need the `urllib` and `libarchive` modules.
+
+If you give a schema name using the `-n` switch, all the tables will be moved
+to the given schema. This schema will be created in the script.
+
+To load the _dba.stackexchange.com_ project in the `dba` schema, you would execute:
+`./load_into_pg.py -s dba -n dba`
+
+The paths are not changed in the final scripts `sql/final_post.sql` and
+`sql/optional_post.sql`. To run them, first set the _search_path_ to your
+schema name: `SET search_path TO <myschema>;`
+
+## Caveats and TODOs
 
  - It prepares some indexes and views which may not be necessary for your analysis.
  - The `Body` field in `Posts` table is NOT populated by default. You have to use `--with-post-body` argument to include it.
diff --git a/load_into_pg.py b/load_into_pg.py
index 66b651d..ba28cf3 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -1,8 +1,9 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import sys
 import time
 import argparse
 import psycopg2 as pg
+import os
 import row_processor as Processor
 import six
 import json
@@ -12,6 +13,51 @@
     ('Posts', 'ViewCount'): "NULLIF(%(ViewCount)s, '')::int"
 }
 
+# part of the file already downloaded
+file_part = None
+
+def show_progress(block_num, block_size, total_size):
+    """Display the total size of the file to download and the progess in percent"""
+    global file_part
+    if file_part is None:
+        suffixes=['B','KB','MB','GB','TB']
+        suffixIndex = 0
+        pp_size = total_size
+        while pp_size > 1024:
+            suffixIndex += 1 #increment the index of the suffix
+            pp_size = pp_size/1024.0 #apply the division
+        six.print_('Total file size is: {0:.1f} {1}'.format(pp_size,suffixes[suffixIndex]))
+        six.print_("0 % of the file downloaded ...\r", end="", flush=True)
+        file_part = 0
+
+    downloaded = block_num * block_size
+    if downloaded < total_size:
+        percent = 100 * downloaded / total_size
+        if percent - file_part > 1:
+            file_part = percent
+            six.print_("{0} % of the file downloaded ...\r".format(int(percent)), end="", flush=True)
+    else:
+        file_part = None
+        six.print_("")
+
+def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword):
+    dbConnectionParam = "dbname={}".format(dbname)
+
+    if mbPort is not None:
+        dbConnectionParam += ' port={}'.format(mbPort)
+
+    if mbHost is not None:
+        dbConnectionParam += ' host={}'.format(mbHost)
+
+    # TODO Is the escaping done here correct?
+    if mbUsername is not None:
+        dbConnectionParam += ' user={}'.format(mbUsername)
+
+    # TODO Is the escaping done here correct?
+    if mbPassword is not None:
+        dbConnectionParam += ' password={}'.format(mbPassword)
+    return dbConnectionParam
+
 def _makeDefValues(keys):
     """Returns a dictionary containing None for all keys."""
     return dict(( (k, None) for k in keys ))
@@ -150,7 +196,7 @@ def _getTableKeys(table):
         ]
     return keys
 
-def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
+def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam):
     """Handle the table including the post/pre processing."""
     keys       = _getTableKeys(table)
     dbFile     = mbDbFile if mbDbFile is not None else table + '.xml'
@@ -165,23 +211,6 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m
         six.print_("Could not load pre/post/fk sql. Are you running from the correct path?", file=sys.stderr)
         sys.exit(-1)
 
-    dbConnectionParam = "dbname={}".format(dbname)
-
-    if mbPort is not None:
-        dbConnectionParam += ' port={}'.format(mbPort)
-
-    if mbHost is not None:
-        dbConnectionParam += ' host={}'.format(mbHost)
-
-    # TODO Is the escaping done here correct?
-    if mbUsername is not None:
-        dbConnectionParam += ' user={}'.format(mbUsername)
-
-    # TODO Is the escaping done here correct?
-    if mbPassword is not None:
-        dbConnectionParam += ' password={}'.format(mbPassword)
-
-
     try:
         with pg.connect(dbConnectionParam) as conn:
             with conn.cursor() as cur:
@@ -208,7 +237,7 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m
                                       ' VALUES\n' + valuesStr + ';'
                                 cur.execute(cmd)
                                 conn.commit()
-                        six.print_('Table {0} processing took {1:.1f} seconds'.format(table, time.time() - start_time))
+                        six.print_('Table \'{0}\' processing took {1:.1f} seconds'.format(table, time.time() - start_time))
 
                         # Post-processing (creation of indexes)
                         start_time = time.time()
@@ -237,12 +266,32 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m
         six.print_("Warning from the database.", file=sys.stderr)
         six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr)
 
+
+def moveTableToSchema(table, schemaName, dbConnectionParam):
+    try:
+        with pg.connect(dbConnectionParam) as conn:
+            with conn.cursor() as cur:
+                # create the schema
+                cur.execute('CREATE SCHEMA IF NOT EXISTS '+schemaName+';')
+                conn.commit()
+                # move the table to the right schema
+                cur.execute('ALTER TABLE '+table+' SET SCHEMA '+schemaName+';')
+                conn.commit()
+    except pg.Error as e:
+        six.print_("Error in dealing with the database.", file=sys.stderr)
+        six.print_("pg.Error ({0}): {1}".format(e.pgcode, e.pgerror), file=sys.stderr)
+        six.print_(str(e), file=sys.stderr)
+    except pg.Warning as w:
+        six.print_("Warning from the database.", file=sys.stderr)
+        six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr)
+
 #############################################################
 
 parser = argparse.ArgumentParser()
-parser.add_argument( 'table'
+parser.add_argument( '-t', '--table'
                    , help    = 'The table to work on.'
                    , choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments']
+                   , default = None
                    )
 
 parser.add_argument( '-d', '--dbname'
@@ -255,6 +304,22 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m
                    , default = None
                    )
 
+parser.add_argument( '-s', '--so-project'
+                   , help    = 'stackexchange project to load.'
+                   , default = None
+                   )
+
+parser.add_argument( '--archive-url'
+                   , help    = 'URL of the archive directory to retrieve.'
+                   , default = 'https://ia800107.us.archive.org/27/items/stackexchange'
+                   )
+
+parser.add_argument( '-k', '--keep-archive'
+                   , help    = 'should we keep the downloaded archive.'
+                   , action = 'store_true'
+                   , default = False
+                   )
+
 parser.add_argument( '-u', '--username'
                    , help    = 'Username for the database.'
                    , default = None
@@ -287,6 +352,11 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m
                    , default = False
                    )
 
+parser.add_argument( '-n', '--schema-name'
+                   , help    = 'Use specific schema.'
+                   , default = 'public'
+                   )
+
 parser.add_argument( '--foreign-keys'
                    , help    = 'Create foreign keys.'
                    , action = 'store_true'
@@ -295,22 +365,71 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m
 
 args = parser.parse_args()
 
-table = args.table
-
 try:
     # Python 2/3 compatibility
     input = raw_input
 except NameError:
     pass
 
+dbConnectionParam = buildConnectionString(args.dbname, args.host, args.port, args.username, args.password)
+
+# load given file in table
+if args.file and args.table:
+    table = args.table
+
+    if table == 'Posts':
+        # If the user has not explicitly asked for loading the body, we replace it with NULL
+        if not args.with_post_body:
+            specialRules[('Posts', 'Body')] = 'NULL'
+
+    choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table))
+    if len(choice) > 0 and choice[0].lower() == 'y':
+        handleTable(table, args.insert_json, args.foreign_keys, args.file, dbConnectionParam)
+    else:
+        six.print_("Cancelled.")
+    if args.schema_name != 'public':
+        moveTableToSchema(table, args.schema_name, dbConnectionParam)
+    exit(0)
+
+# load a project
+elif args.so_project:
+    import urllib.request
+    import libarchive
+
+    # download the 7z archive in /tmp
+    file_name = args.so_project + '.stackexchange.com.7z'
+    url = '{0}/{1}'.format(args.archive_url, file_name)
+    filepath = '/tmp/'+file_name
+    six.print_('Downloading the archive, please be patient ...')
+    try:
+        urllib.request.urlretrieve(url, filepath, show_progress)
+    except Exception as e:
+        six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e))
+        exit(1)
+
+    try:
+        libarchive.extract_file(filepath)
+    except Exception as e:
+        six.print_('Error: impossible to extract the {0} archive ({1})'.format(url, e))
+        exit(1)
+
+    tables = [ 'Tags', 'Users', 'Badges', 'Posts', 'Comments', 'Votes', 'PostLinks', 'PostHistory' ]
+
+    for table in tables:
+        six.print_('Load {0}.xml file'.format(table))
+        handleTable(table, args.insert_json, args.foreign_keys, args.file, dbConnectionParam)
+        # remove file
+        os.remove(table+'.xml')
+
+    if not args.keep_archive:
+        # remove archive
+        os.remove(filepath)
 
-if table == 'Posts':
-    # If the user has not explicitly asked for loading the body, we replace it with NULL
-    if not args.with_post_body:
-        specialRules[('Posts', 'Body')] = 'NULL'
+    if args.schema_name != 'public':
+        for table in tables:
+            moveTableToSchema(table, args.schema_name, dbConnectionParam)
+    exit(0)
 
-choice = input('This will drop the {} table. Are you sure [y/n]? '.format(table))
-if len(choice) > 0 and choice[0].lower() == 'y':
-    handleTable(table, args.insert_json, args.foreign_keys, args.dbname, args.file, args.host, args.port, args.username, args.password)
 else:
-    six.print_("Cancelled.")
+    six.print_("Error: you must either use '-f' and '-t'  arguments or the '-s' argument.")
+    parser.print_help()
diff --git a/sql/Votes_pre.sql b/sql/Votes_pre.sql
index 29aebe0..3ed0b53 100644
--- a/sql/Votes_pre.sql
+++ b/sql/Votes_pre.sql
@@ -1,7 +1,7 @@
 DROP TABLE IF EXISTS Votes CASCADE;
 CREATE TABLE Votes (
    Id                int         PRIMARY KEY ,
-   PostId            int         not NULL    ,
+   PostId            int         , -- not NULL    ,
    VoteTypeId        int         not NULL    ,
    UserId            int                     ,
    CreationDate      timestamp   not NULL    ,

From fc4dc261bbb3501ffd2c1cfdffb5e66438bc58d3 Mon Sep 17 00:00:00 2001
From: madtibo <madtibo_git@tribu-ml.fr>
Date: Tue, 12 Feb 2019 18:08:26 +0100
Subject: [PATCH 2/9] make it possible to keep downloaded archive

---
 load_into_pg.py | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/load_into_pg.py b/load_into_pg.py
index ba28cf3..45a6a13 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 import sys
 import time
 import argparse
@@ -245,7 +245,7 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam):
                         if post != '':
                             cur.execute(post)
                             conn.commit()
-                        six.print_('Post processing took {} seconds'.format(time.time() - start_time))
+                        six.print_('Post processing took {0:.1f} seconds'.format(time.time() - start_time))
                         if createFk:
                             # fk-processing (creation of foreign keys)
                             start_time = time.time()
@@ -253,7 +253,7 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam):
                             if post != '':
                                 cur.execute(fk)
                                 conn.commit()
-                            six.print_('fk processing took {} seconds'.format(time.time() - start_time))
+                            six.print_('fk processing took {0:.1f} seconds'.format(time.time() - start_time))
 
                 except IOError as e:
                     six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr)
@@ -393,19 +393,22 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
 
 # load a project
 elif args.so_project:
-    import urllib.request
     import libarchive
 
-    # download the 7z archive in /tmp
-    file_name = args.so_project + '.stackexchange.com.7z'
-    url = '{0}/{1}'.format(args.archive_url, file_name)
-    filepath = '/tmp/'+file_name
-    six.print_('Downloading the archive, please be patient ...')
-    try:
-        urllib.request.urlretrieve(url, filepath, show_progress)
-    except Exception as e:
-        six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e))
-        exit(1)
+    filepath = None
+    if args.file:
+        filepath = args.file
+    else:
+        # download the 7z archive in /tmp
+        file_name = args.so_project + '.stackexchange.com.7z'
+        url = '{0}/{1}'.format(args.archive_url, file_name)
+        filepath = '/tmp/'+file_name
+        six.print_('Downloading the archive, please be patient ...')
+        try:
+            six.moves.urllib.request.urlretrieve(url, filepath, show_progress)
+        except Exception as e:
+            six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e))
+            exit(1)
 
     try:
         libarchive.extract_file(filepath)
@@ -417,7 +420,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
 
     for table in tables:
         six.print_('Load {0}.xml file'.format(table))
-        handleTable(table, args.insert_json, args.foreign_keys, args.file, dbConnectionParam)
+        handleTable(table, args.insert_json, args.foreign_keys, None, dbConnectionParam)
         # remove file
         os.remove(table+'.xml')
 

From 2b39ef6e65caad859eb3c4f7cd698e098de2b309 Mon Sep 17 00:00:00 2001
From: Utkarsh Upadhyay <502876+musically-ut@users.noreply.github.com>
Date: Thu, 11 Apr 2019 18:12:08 +0200
Subject: [PATCH 3/9] Minor spacing fixes.

---
 load_into_pg.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/load_into_pg.py b/load_into_pg.py
index 45a6a13..ca8e34a 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -272,10 +272,10 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
         with pg.connect(dbConnectionParam) as conn:
             with conn.cursor() as cur:
                 # create the schema
-                cur.execute('CREATE SCHEMA IF NOT EXISTS '+schemaName+';')
+                cur.execute('CREATE SCHEMA IF NOT EXISTS ' + schemaName + ';')
                 conn.commit()
                 # move the table to the right schema
-                cur.execute('ALTER TABLE '+table+' SET SCHEMA '+schemaName+';')
+                cur.execute('ALTER TABLE '+table+' SET SCHEMA ' + schemaName + ';')
                 conn.commit()
     except pg.Error as e:
         six.print_("Error in dealing with the database.", file=sys.stderr)
@@ -305,7 +305,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
                    )
 
 parser.add_argument( '-s', '--so-project'
-                   , help    = 'stackexchange project to load.'
+                   , help    = 'StackExchange project to load.'
                    , default = None
                    )
 
@@ -315,8 +315,8 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
                    )
 
 parser.add_argument( '-k', '--keep-archive'
-                   , help    = 'should we keep the downloaded archive.'
-                   , action = 'store_true'
+                   , help    = 'Will preserve the downloaded archive instead of deleting it.'
+                   , action  = 'store_true'
                    , default = False
                    )
 

From 22b0de5ef1bfe07792d857c8af17865a8cb84355 Mon Sep 17 00:00:00 2001
From: madtibo <madtibo_git@tribu-ml.fr>
Date: Wed, 17 Apr 2019 12:09:31 +0200
Subject: [PATCH 4/9] store downloaded file in temporary dir using python
 tempfile library

---
 README.md       |  2 +-
 load_into_pg.py | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 6b021c5..d81dd89 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
 ## Dependencies
 
  - [`lxml`](http://lxml.de/installation.html)
- - [`psychopg2`](http://initd.org/psycopg/docs/install.html)
+ - [`psycopg2`](http://initd.org/psycopg/docs/install.html)
 
 ## Usage
 
diff --git a/load_into_pg.py b/load_into_pg.py
index ca8e34a..21e8dee 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -394,16 +394,23 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
 # load a project
 elif args.so_project:
     import libarchive
+    import tempfile
 
     filepath = None
     if args.file:
         filepath = args.file
+        url = filepath
     else:
-        # download the 7z archive in /tmp
+        # download the 7z archive in tempdir
         file_name = args.so_project + '.stackexchange.com.7z'
         url = '{0}/{1}'.format(args.archive_url, file_name)
-        filepath = '/tmp/'+file_name
-        six.print_('Downloading the archive, please be patient ...')
+        temp_dir = tempfile.gettempdir()
+        if temp_dir == 'None':
+            six.print_('WARNING: Could not find temporary directory. Use current directory instead.')
+            temp_dir = os.getcwd()
+        filepath = os.path.join(temp_dir, file_name)
+        six.print_('Downloading the archive in {0}'.format(filepath))
+        six.print_('please be patient ...')
         try:
             six.moves.urllib.request.urlretrieve(url, filepath, show_progress)
         except Exception as e:

From d9e17353014dba890d4ebfb4da90485e89d491d3 Mon Sep 17 00:00:00 2001
From: madtibo <madtibo_git@tribu-ml.fr>
Date: Fri, 26 Apr 2019 16:36:54 +0200
Subject: [PATCH 5/9] store in temp directory and remove it if needed

---
 load_into_pg.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/load_into_pg.py b/load_into_pg.py
index 21e8dee..3fedb7a 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -397,6 +397,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
     import tempfile
 
     filepath = None
+    temp_dir = None
     if args.file:
         filepath = args.file
         url = filepath
@@ -404,10 +405,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
         # download the 7z archive in tempdir
         file_name = args.so_project + '.stackexchange.com.7z'
         url = '{0}/{1}'.format(args.archive_url, file_name)
-        temp_dir = tempfile.gettempdir()
-        if temp_dir == 'None':
-            six.print_('WARNING: Could not find temporary directory. Use current directory instead.')
-            temp_dir = os.getcwd()
+        temp_dir = tempfile.mkdtemp(prefix='so_')
         filepath = os.path.join(temp_dir, file_name)
         six.print_('Downloading the archive in {0}'.format(filepath))
         six.print_('please be patient ...')
@@ -432,8 +430,12 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
         os.remove(table+'.xml')
 
     if not args.keep_archive:
-        # remove archive
         os.remove(filepath)
+        if temp_dir:
+            # remove the archive and the temporary directory
+            os.rmdir(temp_dir)
+        else:
+            six.print_("Archive '{0}' deleted".format(filepath))
 
     if args.schema_name != 'public':
         for table in tables:

From 588b74041fb3318b2f97c81803400d2c9ce14d91 Mon Sep 17 00:00:00 2001
From: Utkarsh Upadhyay <musically.ut@gmail.com>
Date: Thu, 2 May 2019 00:53:13 +0200
Subject: [PATCH 6/9] Adds libarchive as a dependency.

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index d81dd89..24146e2 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
 
  - [`lxml`](http://lxml.de/installation.html)
  - [`psycopg2`](http://initd.org/psycopg/docs/install.html)
+ - [`libarchive`](https://www.libarchive.org/)
 
 ## Usage
 

From 7bede10d79750bb2cfeb52a6048fb0aad8bd97ec Mon Sep 17 00:00:00 2001
From: Utkarsh Upadhyay <musically.ut@gmail.com>
Date: Thu, 2 May 2019 15:27:04 +0200
Subject: [PATCH 7/9] Addresses the correct libarchive library.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 24146e2..36e9b2b 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
 
  - [`lxml`](http://lxml.de/installation.html)
  - [`psycopg2`](http://initd.org/psycopg/docs/install.html)
- - [`libarchive`](https://www.libarchive.org/)
+ - [`libarchive-c`](https://pypi.org/project/libarchive-c/)
 
 ## Usage
 

From dc79fdab7e20aff873f13425c0467ea52cc3068e Mon Sep 17 00:00:00 2001
From: Utkarsh Upadhyay <musically.ut@gmail.com>
Date: Thu, 2 May 2019 19:38:20 +0200
Subject: [PATCH 8/9] Fixes spacing issues.

---
 load_into_pg.py | 103 ++++++++++++++++++++++++++----------------------
 1 file changed, 56 insertions(+), 47 deletions(-)

diff --git a/load_into_pg.py b/load_into_pg.py
index 3fedb7a..c1b53aa 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+
 import sys
 import time
 import argparse
@@ -16,17 +17,19 @@
 # part of the file already downloaded
 file_part = None
 
+
 def show_progress(block_num, block_size, total_size):
-    """Display the total size of the file to download and the progess in percent"""
+    """Display the total size of the file to download and the progress in percent"""
     global file_part
     if file_part is None:
-        suffixes=['B','KB','MB','GB','TB']
+        suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
         suffixIndex = 0
         pp_size = total_size
         while pp_size > 1024:
-            suffixIndex += 1 #increment the index of the suffix
-            pp_size = pp_size/1024.0 #apply the division
-        six.print_('Total file size is: {0:.1f} {1}'.format(pp_size,suffixes[suffixIndex]))
+            suffixIndex += 1  # Increment the index of the suffix
+            pp_size = pp_size / 1024.0  # Apply the division
+        six.print_('Total file size is: {0:.1f} {1}'
+                   .format(pp_size, suffixes[suffixIndex]))
         six.print_("0 % of the file downloaded ...\r", end="", flush=True)
         file_part = 0
 
@@ -40,6 +43,7 @@ def show_progress(block_num, block_size, total_size):
         file_part = None
         six.print_("")
 
+
 def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword):
     dbConnectionParam = "dbname={}".format(dbname)
 
@@ -58,20 +62,23 @@ def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword):
         dbConnectionParam += ' password={}'.format(mbPassword)
     return dbConnectionParam
 
+
 def _makeDefValues(keys):
     """Returns a dictionary containing None for all keys."""
-    return dict(( (k, None) for k in keys ))
+    return dict(((k, None) for k in keys))
+
 
 def _createMogrificationTemplate(table, keys, insertJson):
     """Return the template string for mogrification for the given keys."""
-    table_keys = ', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules
-                              else specialRules[table, k]
-                              for k in keys ])
+    table_keys = ', '.join(['%(' + k + ')s' if (table, k) not in specialRules
+                            else specialRules[table, k]
+                            for k in keys])
     if insertJson:
         return ('(' + table_keys + ', %(jsonfield)s' + ')')
     else:
         return ('(' + table_keys + ')')
 
+
 def _createCmdTuple(cursor, keys, templ, attribs, insertJson):
     """Use the cursor to mogrify a tuple of data.
     The passed data in `attribs` is augmented with default data (NULLs) and the
@@ -83,14 +90,14 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson):
     defs.update(attribs)
 
     if insertJson:
-        dict_attribs = { }
+        dict_attribs = {}
         for name, value in attribs.items():
             dict_attribs[name] = value
         defs['jsonfield'] = json.dumps(dict_attribs)
 
-    values_to_insert = cursor.mogrify(templ, defs)
     return cursor.mogrify(templ, defs)
 
+
 def _getTableKeys(table):
     """Return an array of the keys for a given table"""
     keys = None
@@ -177,25 +184,26 @@ def _getTableKeys(table):
         ]
     elif table == 'PostHistory':
         keys = [
-            'Id',
-            'PostHistoryTypeId',
-            'PostId',
-            'RevisionGUID',
-            'CreationDate',
-            'UserId',
-            'Text'
+            'Id'
+            , 'PostHistoryTypeId'
+            , 'PostId'
+            , 'RevisionGUID'
+            , 'CreationDate'
+            , 'UserId'
+            , 'Text'
         ]
     elif table == 'Comments':
         keys = [
-            'Id',
-            'PostId',
-            'Score',
-            'Text',
-            'CreationDate',
-            'UserId',
+            'Id'
+            , 'PostId'
+            , 'Score'
+            , 'Text'
+            , 'CreationDate'
+            , 'UserId'
         ]
     return keys
 
+
 def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam):
     """Handle the table including the post/pre processing."""
     keys       = _getTableKeys(table)
@@ -228,10 +236,10 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam):
                         six.print_('Processing data ...')
                         for rows in Processor.batch(Processor.parse(xml), 500):
                             valuesStr = ',\n'.join(
-                                            [ _createCmdTuple(cur, keys, tmpl, row_attribs, insertJson).decode('utf-8')
-                                                for row_attribs in rows
-                                            ]
-                                        )
+                                [_createCmdTuple(cur, keys, tmpl, row_attribs, insertJson).decode('utf-8')
+                                 for row_attribs in rows
+                                 ]
+                            )
                             if len(valuesStr) > 0:
                                 cmd = 'INSERT INTO ' + table + \
                                       ' VALUES\n' + valuesStr + ';'
@@ -249,11 +257,11 @@ def handleTable(table, insertJson, createFk, mbDbFile, dbConnectionParam):
                         if createFk:
                             # fk-processing (creation of foreign keys)
                             start_time = time.time()
-                            six.print_('fk processing ...')
+                            six.print_('Foreign Key processing ...')
                             if post != '':
                                 cur.execute(fk)
                                 conn.commit()
-                            six.print_('fk processing took {0:.1f} seconds'.format(time.time() - start_time))
+                            six.print_('Foreign Key processing took {0:.1f} seconds'.format(time.time() - start_time))
 
                 except IOError as e:
                     six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr)
@@ -275,7 +283,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
                 cur.execute('CREATE SCHEMA IF NOT EXISTS ' + schemaName + ';')
                 conn.commit()
                 # move the table to the right schema
-                cur.execute('ALTER TABLE '+table+' SET SCHEMA ' + schemaName + ';')
+                cur.execute('ALTER TABLE ' + table + ' SET SCHEMA ' + schemaName + ';')
                 conn.commit()
     except pg.Error as e:
         six.print_("Error in dealing with the database.", file=sys.stderr)
@@ -288,76 +296,76 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
 #############################################################
 
 parser = argparse.ArgumentParser()
-parser.add_argument( '-t', '--table'
+parser.add_argument('-t', '--table'
                    , help    = 'The table to work on.'
                    , choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments']
                    , default = None
                    )
 
-parser.add_argument( '-d', '--dbname'
+parser.add_argument('-d', '--dbname'
                    , help    = 'Name of database to create the table in. The database must exist.'
                    , default = 'stackoverflow'
                    )
 
-parser.add_argument( '-f', '--file'
+parser.add_argument('-f', '--file'
                    , help    = 'Name of the file to extract data from.'
                    , default = None
                    )
 
-parser.add_argument( '-s', '--so-project'
+parser.add_argument('-s', '--so-project'
                    , help    = 'StackExchange project to load.'
                    , default = None
                    )
 
-parser.add_argument( '--archive-url'
+parser.add_argument('--archive-url'
                    , help    = 'URL of the archive directory to retrieve.'
                    , default = 'https://ia800107.us.archive.org/27/items/stackexchange'
                    )
 
-parser.add_argument( '-k', '--keep-archive'
+parser.add_argument('-k', '--keep-archive'
                    , help    = 'Will preserve the downloaded archive instead of deleting it.'
                    , action  = 'store_true'
                    , default = False
                    )
 
-parser.add_argument( '-u', '--username'
+parser.add_argument('-u', '--username'
                    , help    = 'Username for the database.'
                    , default = None
                    )
 
-parser.add_argument( '-p', '--password'
+parser.add_argument('-p', '--password'
                    , help    = 'Password for the database.'
                    , default = None
                    )
 
-parser.add_argument( '-P', '--port'
+parser.add_argument('-P', '--port'
                    , help    = 'Port to connect with the database on.'
                    , default = None
                    )
 
-parser.add_argument( '-H', '--host'
+parser.add_argument('-H', '--host'
                    , help    = 'Hostname for the database.'
                    , default = None
                    )
 
-parser.add_argument( '--with-post-body'
+parser.add_argument('--with-post-body'
                    , help   = 'Import the posts with the post body. Only used if importing Posts.xml'
                    , action = 'store_true'
                    , default = False
                    )
 
-parser.add_argument( '-j', '--insert-json'
+parser.add_argument('-j', '--insert-json'
                    , help    = 'Insert raw data as JSON.'
                    , action = 'store_true'
                    , default = False
                    )
 
-parser.add_argument( '-n', '--schema-name'
+parser.add_argument('-n', '--schema-name'
                    , help    = 'Use specific schema.'
                    , default = 'public'
                    )
 
-parser.add_argument( '--foreign-keys'
+parser.add_argument('--foreign-keys'
                    , help    = 'Create foreign keys.'
                    , action = 'store_true'
                    , default = False
@@ -421,13 +429,14 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
         six.print_('Error: impossible to extract the {0} archive ({1})'.format(url, e))
         exit(1)
 
-    tables = [ 'Tags', 'Users', 'Badges', 'Posts', 'Comments', 'Votes', 'PostLinks', 'PostHistory' ]
+    tables = ['Tags', 'Users', 'Badges', 'Posts', 'Comments',
+              'Votes', 'PostLinks', 'PostHistory']
 
     for table in tables:
         six.print_('Load {0}.xml file'.format(table))
         handleTable(table, args.insert_json, args.foreign_keys, None, dbConnectionParam)
         # remove file
-        os.remove(table+'.xml')
+        os.remove(table + '.xml')
 
     if not args.keep_archive:
         os.remove(filepath)

From e0be1b2982d1d11e73cd37ed2536df204cab02a5 Mon Sep 17 00:00:00 2001
From: Utkarsh Upadhyay <musically.ut@gmail.com>
Date: Thu, 2 May 2019 19:47:26 +0200
Subject: [PATCH 9/9] Fix minor spacing.

---
 load_into_pg.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/load_into_pg.py b/load_into_pg.py
index c1b53aa..d7c3158 100755
--- a/load_into_pg.py
+++ b/load_into_pg.py
@@ -295,6 +295,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
 
 #############################################################
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument('-t', '--table'
                    , help    = 'The table to work on.'
@@ -349,14 +350,14 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
                    )
 
 parser.add_argument('--with-post-body'
-                   , help   = 'Import the posts with the post body. Only used if importing Posts.xml'
-                   , action = 'store_true'
+                   , help    = 'Import the posts with the post body. Only used if importing Posts.xml'
+                   , action  = 'store_true'
                    , default = False
                    )
 
 parser.add_argument('-j', '--insert-json'
                    , help    = 'Insert raw data as JSON.'
-                   , action = 'store_true'
+                   , action  = 'store_true'
                    , default = False
                    )
 
@@ -367,7 +368,7 @@ def moveTableToSchema(table, schemaName, dbConnectionParam):
 
 parser.add_argument('--foreign-keys'
                    , help    = 'Create foreign keys.'
-                   , action = 'store_true'
+                   , action  = 'store_true'
                    , default = False
                    )