diff --git a/README.md b/README.md index 826a4ef..2781792 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,72 @@ # StackOverflow data to postgres -This is a quick script to move the Stackoverflow data from the [StackExchange data dump (Sept '14)](https://archive.org/details/stackexchange) to a Postgres SQL database. - -Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede) and from [StackExchange Data Explorer](http://data.stackexchange.com). - -## Dependencies - - - [`lxml`](http://lxml.de/installation.html) - - [`psychopg2`](http://initd.org/psycopg/docs/install.html) - -## Usage - - - Create the database `stackoverflow` in your database: `CREATE DATABASE stackoverflow;` - - You can use a custom database name as well. Make sure to explicitly give - it while executing the script later. - - Move the following files to the folder from where the program is executed: - `Badges.xml`, `Votes.xml`, `Posts.xml`, `Users.xml`, `Tags.xml`. - - In some old dumps, the cases in the filenames are different. - - Execute in the current folder (in parallel, if desired): - - `python load_into_pg.py Badges` - - `python load_into_pg.py Posts` - - `python load_into_pg.py Tags` (not present in earliest dumps) - - `python load_into_pg.py Users` - - `python load_into_pg.py Votes` - - `python load_into_pg.py PostLinks` - - `python load_into_pg.py PostHistory` - - `python load_into_pg.py Comments` - - Finally, after all the initial tables have been created: - - `psql stackoverflow < ./sql/final_post.sql` - - If you used a different database name, make sure to use that instead of - `stackoverflow` while executing this step. - - For some additional indexes and tables, you can also execute the the following; - - `psql stackoverflow < ./sql/optional_post.sql` - - Again, remember to user the correct database name here, if not `stackoverflow`. +This is a quick script to move the Stack Overflow data from the [StackExchange +data dump (Sept '14)](https://archive.org/details/stackexchange) to a PostgreSQL +database. + +Schema hints are taken from [a post on +Meta.StackExchange](http://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede) +and from [StackExchange Data Explorer](http://data.stackexchange.com). + +## Quickstart + +Install the requirements, create a new database (e.g. `beerSO` below), and use the `load_into_pg.py` script: + +``` console +$ pip install -r requirements.txt +... +Successfully installed argparse-1.2.1 libarchive-c-2.9 lxml-4.5.2 psycopg2-binary-2.8.4 six-1.10.0 +$ createdb beerSO +$ python load_into_pg.py -s beer -d beerSO +``` + +This will download compressed files from +[archive.org](https://ia800107.us.archive.org/27/items/stackexchange/) and load +all the tables at once. + + +## Advanced Usage + +You can use a custom database name as well. Make sure to explicitly give it +while executing the script later. + +Each table's data is archived in an XML file. The available tables vary across +dump history. `load_into_pg.py` knows how to handle the following tables: + +- `Badges`. +- `Posts`. +- `Tags` (not present in earliest dumps). +- `Users`. +- `Votes`. +- `PostLinks`. +- `PostHistory`. +- `Comments`. + +You can manually download the files to the folder from where the program is +executed: `Badges.xml`, `Votes.xml`, `Posts.xml`, `Users.xml`, `Tags.xml`. In +some old dumps, the cases in the filenames are different. + +Then load each file with e.g.
`python load_into_pg.py -t Badges`. + +After all the initial tables have been created: + +``` console +$ psql beerSO < ./sql/final_post.sql +``` + +For some additional indexes and tables, you can also execute the following: + +``` console +$ psql beerSO < ./sql/optional_post.sql +``` + +If you give a schema name using the `-n` switch, all the tables will be moved +to the given schema. The schema will be created by the script. + +The paths are not changed in the final scripts `sql/final_post.sql` and +`sql/optional_post.sql`. To run them, first set the _search_path_ to your +schema name: `SET search_path TO ;` + ## Caveats and TODOs @@ -44,3 +78,10 @@ Schema hints are taken from [a post on Meta.StackExch - The `tags.xml` is missing from the data dump. Hence, the `PostTag` and `UserTagQA` tables will be empty after `final_post.sql`. - The `ViewCount` in `Posts` is sometimes equal to an `empty` value. It is replaced by `NULL` in those cases. + + +## Acknowledgements + + - [@madtibo](https://github.com/madtibo) made significant contributions by adding `jsonb` and Foreign Key support. + - [@bersace](https://github.com/bersace) brought the dependencies and the `README.md` instructions into the 2020s. + - [@rdrg109](https://github.com/rdrg109) simplified handling of non-public schemas and fixed bugs associated with re-importing tables. diff --git a/load_into_pg.py b/load_into_pg.py index c65e854..2d6cef5 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -1,109 +1,275 @@ #!/usr/bin/env python + import sys import time import argparse import psycopg2 as pg +import os import row_processor as Processor import six +import json # Special rules needed for certain tables (esp. for old database dumps) -specialRules = { - ('Posts', 'ViewCount'): "NULLIF(%(ViewCount)s, '')::int" -} +specialRules = {("Posts", "ViewCount"): "NULLIF(%(ViewCount)s, '')::int"} + +# part of the file already downloaded +file_part = None + + +def show_progress(block_num, block_size, total_size): + """Display the total size of the file to download and the progress in percent""" + global file_part + if file_part is None: + suffixes = ["B", "KB", "MB", "GB", "TB"] + suffixIndex = 0 + pp_size = total_size + while pp_size > 1024: + suffixIndex += 1 # Increment the index of the suffix + pp_size = pp_size / 1024.0 # Apply the division + six.print_( + "Total file size is: {0:.1f} {1}".format(pp_size, suffixes[suffixIndex]) + ) + six.print_("0 % of the file downloaded ...\r", end="", flush=True) + file_part = 0 + + downloaded = block_num * block_size + if downloaded < total_size: + percent = 100 * downloaded / total_size + if percent - file_part > 1: + file_part = percent + six.print_( + "{0} % of the file downloaded ...\r".format(int(percent)), + end="", + flush=True, + ) + else: + file_part = None + six.print_("") + +def getConnectionParameters(): + """Get the parameters for the connection to the database.""" + + parameters = {} + + if args.dbname: + parameters['dbname'] = args.dbname + + if args.host: + parameters['host'] = args.host + + if args.port: + parameters['port'] = args.port + + if args.username: + parameters['user'] = args.username + + if args.password: + parameters['password'] = args.password + + if args.schema_name: + parameters['options'] = "-c search_path=" + args.schema_name + + return parameters + def _makeDefValues(keys): """Returns a dictionary containing None for all keys.""" - return dict(( (k, None) for k in keys )) + return dict(((k, None) for k
in keys)) -def _createMogrificationTemplate(table, keys): + +def _createMogrificationTemplate(table, keys, insertJson): """Return the template string for mogrification for the given keys.""" - return ( '(' + - ', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules else specialRules[table, k] - for k in keys - ] - ) + - ')' - ) - -def _createCmdTuple(cursor, keys, templ, attribs): + table_keys = ", ".join( + [ + "%(" + k + ")s" + if (table, k) not in specialRules + else specialRules[table, k] + for k in keys + ] + ) + if insertJson: + return "(" + table_keys + ", %(jsonfield)s" + ")" + else: + return "(" + table_keys + ")" + + +def _createCmdTuple(cursor, keys, templ, attribs, insertJson): """Use the cursor to mogrify a tuple of data. The passed data in `attribs` is augmented with default data (NULLs) and the order of data in the tuple is the same as in the list of `keys`. The - `cursor` is used toe mogrify the data and the `templ` is the template used + `cursor` is used to mogrify the data and the `templ` is the template used for the mogrification. """ defs = _makeDefValues(keys) defs.update(attribs) + + if insertJson: + dict_attribs = {} + for name, value in attribs.items(): + dict_attribs[name] = value + defs["jsonfield"] = json.dumps(dict_attribs) + return cursor.mogrify(templ, defs) -def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): + +def _getTableKeys(table): + """Return an array of the keys for a given table""" + keys = None + if table == "Users": + keys = [ + "Id", + "Reputation", + "CreationDate", + "DisplayName", + "LastAccessDate", + "WebsiteUrl", + "Location", + "AboutMe", + "Views", + "UpVotes", + "DownVotes", + "ProfileImageUrl", + "Age", + "AccountId", + ] + elif table == "Badges": + keys = ["Id", "UserId", "Name", "Date"] + elif table == "PostLinks": + keys = ["Id", "CreationDate", "PostId", "RelatedPostId", "LinkTypeId"] + elif table == "Comments": + keys = ["Id", "PostId", "Score", "Text", "CreationDate", "UserId"] + elif table == "Votes": + keys = ["Id", "PostId", "VoteTypeId", "UserId", "CreationDate", "BountyAmount"] + elif table == "Posts": + keys = [ + "Id", + "PostTypeId", + "AcceptedAnswerId", + "ParentId", + "CreationDate", + "Score", + "ViewCount", + "Body", + "OwnerUserId", + "LastEditorUserId", + "LastEditorDisplayName", + "LastEditDate", + "LastActivityDate", + "Title", + "Tags", + "AnswerCount", + "CommentCount", + "FavoriteCount", + "ClosedDate", + "CommunityOwnedDate", + ] + elif table == "Tags": + keys = ["Id", "TagName", "Count", "ExcerptPostId", "WikiPostId"] + elif table == "PostHistory": + keys = [ + "Id", + "PostHistoryTypeId", + "PostId", + "RevisionGUID", + "CreationDate", + "UserId", + "Text", + ] + elif table == "Comments": + keys = ["Id", "PostId", "Score", "Text", "CreationDate", "UserId"] + return keys + + +def handleTable(table, insertJson, createFk, mbDbFile): """Handle the table including the post/pre processing.""" - dbFile = mbDbFile if mbDbFile is not None else table + '.xml' - tmpl = _createMogrificationTemplate(table, keys) + keys = _getTableKeys(table) + dbFile = mbDbFile if mbDbFile is not None else table + ".xml" + tmpl = _createMogrificationTemplate(table, keys, insertJson) start_time = time.time() try: - pre = open('./sql/' + table + '_pre.sql').read() - post = open('./sql/' + table + '_post.sql').read() + pre = open("./sql/" + table + "_pre.sql").read() + post = open("./sql/" + table + "_post.sql").read() + fk = open("./sql/" + table + "_fk.sql").read() except IOError as e: - 
six.print_("Could not load pre/post sql. Are you running from the correct path?", file=sys.stderr) + six.print_( + "Could not load pre/post/fk sql. Are you running from the correct path?", + file=sys.stderr, + ) sys.exit(-1) - dbConnectionParam = "dbname={}".format(dbname) - - if mbPort is not None: - dbConnectionParam += ' port={}'.format(mbPort) - - if mbHost is not None: - dbConnectionParam += ' host={}'.format(mbHost) - - # TODO Is the escaping done here correct? - if mbUsername is not None: - dbConnectionParam += ' user={}'.format(mbUsername) - - # TODO Is the escaping done here correct? - if mbPassword is not None: - dbConnectionParam += ' password={}'.format(mbPassword) - try: - with pg.connect(dbConnectionParam) as conn: + with pg.connect(**getConnectionParameters()) as conn: with conn.cursor() as cur: try: - with open(dbFile, 'rb') as xml: + with open(dbFile, "rb") as xml: # Pre-processing (dropping/creation of tables) - six.print_('Pre-processing ...') - if pre != '': + six.print_("Pre-processing ...") + if pre != "": cur.execute(pre) conn.commit() - six.print_('Pre-processing took {:.1f} seconds'.format(time.time() - start_time)) + six.print_( + "Pre-processing took {:.1f} seconds".format( + time.time() - start_time + ) + ) # Handle content of the table start_time = time.time() - six.print_('Processing data ...') + six.print_("Processing data ...") for rows in Processor.batch(Processor.parse(xml), 500): - valuesStr = ',\n'.join( - [ _createCmdTuple(cur, keys, tmpl, row_attribs).decode('utf-8') - for row_attribs in rows - ] - ) - + valuesStr = ",\n".join( + [ + _createCmdTuple( + cur, keys, tmpl, row_attribs, insertJson + ).decode("utf-8") + for row_attribs in rows + ] + ) if len(valuesStr) > 0: - cmd = 'INSERT INTO ' + table + \ - ' VALUES\n' + valuesStr + ';' + cmd = ( + "INSERT INTO " + + table + + " VALUES\n" + + valuesStr + + ";" + ) cur.execute(cmd) conn.commit() - six.print_('Table processing took {:.1f} seconds'.format(time.time() - start_time)) + six.print_( + "Table '{0}' processing took {1:.1f} seconds".format( + table, time.time() - start_time + ) + ) # Post-processing (creation of indexes) start_time = time.time() - six.print_('Post processing ...') - if post != '': + six.print_("Post processing ...") + if post != "": cur.execute(post) conn.commit() - six.print_('Post processing took {} seconds'.format(time.time() - start_time)) + six.print_( + "Post processing took {0:.1f} seconds".format( + time.time() - start_time + ) + ) + if createFk: + # fk-processing (creation of foreign keys) + start_time = time.time() + six.print_("Foreign Key processing ...") + if post != "": + cur.execute(fk) + conn.commit() + six.print_( + "Foreign Key processing took {0:.1f} seconds".format( + time.time() - start_time + ) + ) except IOError as e: - six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr) + six.print_( + "Could not read from file {}.".format(dbFile), file=sys.stderr + ) six.print_("IOError: {0}".format(e.strerror), file=sys.stderr) except pg.Error as e: six.print_("Error in dealing with the database.", file=sys.stderr) @@ -113,173 +279,176 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas six.print_("Warning from the database.", file=sys.stderr) six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr) - - ############################################################# parser = argparse.ArgumentParser() -parser.add_argument( 'table' - , help = 'The table to work on.' 
- , choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments'] - ) - -parser.add_argument( '-d', '--dbname' - , help = 'Name of database to create the table in. The database must exist.' - , default = 'stackoverflow' - ) - -parser.add_argument( '-f', '--file' - , help = 'Name of the file to extract data from.' - , default = None - ) - -parser.add_argument( '-u', '--username' - , help = 'Username for the database.' - , default = None - ) - -parser.add_argument( '-p', '--password' - , help = 'Password for the database.' - , default = None - ) - -parser.add_argument( '-P', '--port' - , help = 'Port to connect with the database on.' - , default = None - ) - -parser.add_argument( '-H', '--host' - , help = 'Hostname for the database.' - , default = None - ) - -parser.add_argument( '--with-post-body' - , help = 'Import the posts with the post body. Only used if importing Posts.xml' - , action = 'store_true' - , default = False - ) +parser.add_argument( + "-t", + "--table", + help="The table to work on.", + choices=[ + "Users", + "Badges", + "Posts", + "Tags", + "Votes", + "PostLinks", + "PostHistory", + "Comments", + ], + default=None, +) + +parser.add_argument( + "-d", + "--dbname", + help="Name of database to create the table in. The database must exist.", + default="stackoverflow", +) + +parser.add_argument( + "-f", "--file", help="Name of the file to extract data from.", default=None +) + +parser.add_argument( + "-s", "--so-project", help="StackExchange project to load.", default=None +) + +parser.add_argument( + "--archive-url", + help="URL of the archive directory to retrieve.", + default="https://ia800107.us.archive.org/27/items/stackexchange", +) + +parser.add_argument( + "-k", + "--keep-archive", + help="Will preserve the downloaded archive instead of deleting it.", + action="store_true", + default=False, +) + +parser.add_argument("-u", "--username", help="Username for the database.", default=None) + +parser.add_argument("-p", "--password", help="Password for the database.", default=None) + +parser.add_argument( + "-P", "--port", help="Port to connect with the database on.", default=None +) + +parser.add_argument("-H", "--host", help="Hostname for the database.", default=None) + +parser.add_argument( + "--with-post-body", + help="Import the posts with the post body. 
Only used if importing Posts.xml", + action="store_true", + default=False, +) + +parser.add_argument( + "-j", + "--insert-json", + help="Insert raw data as JSON.", + action="store_true", + default=False, +) + +parser.add_argument( + "-n", "--schema-name", help="Use specific schema.", default="public" +) + +parser.add_argument( + "--foreign-keys", help="Create foreign keys.", action="store_true", default=False +) args = parser.parse_args() -table = args.table -keys = None - -if table == 'Users': - keys = [ - 'Id' - , 'Reputation' - , 'CreationDate' - , 'DisplayName' - , 'LastAccessDate' - , 'WebsiteUrl' - , 'Location' - , 'AboutMe' - , 'Views' - , 'UpVotes' - , 'DownVotes' - , 'ProfileImageUrl' - , 'Age' - , 'AccountId' - ] -elif table == 'Badges': - keys = [ - 'Id' - , 'UserId' - , 'Name' - , 'Date' - ] -elif table == 'PostLinks': - keys = [ - 'Id' - , 'CreationDate' - , 'PostId' - , 'RelatedPostId' - , 'LinkTypeId' - ] -elif table == 'Comments': - keys = [ - 'Id' - , 'PostId' - , 'Score' - , 'Text' - , 'CreationDate' - , 'UserId' - ] -elif table == 'Votes': - keys = [ - 'Id' - , 'PostId' - , 'VoteTypeId' - , 'UserId' - , 'CreationDate' - , 'BountyAmount' - ] -elif table == 'Posts': - keys = [ - 'Id' - , 'PostTypeId' - , 'AcceptedAnswerId' - , 'ParentId' - , 'CreationDate' - , 'Score' - , 'ViewCount' - , 'Body' - , 'OwnerUserId' - , 'LastEditorUserId' - , 'LastEditorDisplayName' - , 'LastEditDate' - , 'LastActivityDate' - , 'Title' - , 'Tags' - , 'AnswerCount' - , 'CommentCount' - , 'FavoriteCount' - , 'ClosedDate' - , 'CommunityOwnedDate' - ] - - # If the user has not explicitly asked for loading the body, we replace it with NULL - if not args.with_post_body: - specialRules[('Posts', 'Body')] = 'NULL' - -elif table == 'Tags': - keys = [ - 'Id' - , 'TagName' - , 'Count' - , 'ExcerptPostId' - , 'WikiPostId' - ] -elif table == 'PostHistory': - keys = [ - 'Id', - 'PostHistoryTypeId', - 'PostId', - 'RevisionGUID', - 'CreationDate', - 'UserId', - 'Text' - ] -elif table == 'Comments': - keys = [ - 'Id', - 'PostId', - 'Score', - 'Text', - 'CreationDate', - 'UserId', - ] - try: # Python 2/3 compatibility input = raw_input except NameError: pass -choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table)) +# load given file in table +if args.file and args.table: + table = args.table + + if table == "Posts": + # If the user has not explicitly asked for loading the body, we replace it with NULL + if not args.with_post_body: + specialRules[("Posts", "Body")] = "NULL" + + choice = input("This will drop the {} table. 
Are you sure [y/n]?".format(table)) + + if len(choice) > 0 and choice[0].lower() == "y": + handleTable( + table, args.insert_json, args.foreign_keys, args.file) + else: + six.print_("Cancelled.") + + exit(0) + +# load a project +elif args.so_project: + import libarchive + import tempfile + + filepath = None + temp_dir = None + if args.file: + filepath = args.file + url = filepath + else: + # download the 7z archive in tempdir + file_name = args.so_project + ".stackexchange.com.7z" + url = "{0}/{1}".format(args.archive_url, file_name) + temp_dir = tempfile.mkdtemp(prefix="so_") + filepath = os.path.join(temp_dir, file_name) + six.print_("Downloading the archive in {0}".format(filepath)) + six.print_("please be patient ...") + try: + six.moves.urllib.request.urlretrieve(url, filepath, show_progress) + except Exception as e: + six.print_( + "Error: impossible to download the {0} archive ({1})".format(url, e) + ) + exit(1) -if len(choice) > 0 and choice[0].lower() == 'y': - handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password) -else: - six.print_("Cancelled.") + try: + libarchive.extract_file(filepath) + except Exception as e: + six.print_("Error: impossible to extract the {0} archive ({1})".format(url, e)) + exit(1) + + tables = [ + "Tags", + "Users", + "Badges", + "Posts", + "Comments", + "Votes", + "PostLinks", + "PostHistory", + ] + + for table in tables: + six.print_("Load {0}.xml file".format(table)) + handleTable(table, args.insert_json, args.foreign_keys, None) + # remove file + os.remove(table + ".xml") + + if not args.keep_archive: + os.remove(filepath) + if temp_dir: + # remove the archive and the temporary directory + os.rmdir(temp_dir) + else: + six.print_("Archive '{0}' deleted".format(filepath)) + exit(0) + +else: + six.print_( + "Error: you must either use '-f' and '-t' arguments or the '-s' argument." + ) + parser.print_help() diff --git a/requirements.txt b/requirements.txt index e1c997c..196607c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ argparse==1.2.1 -distribute==0.6.24 -lxml==3.4.1 -psycopg2==2.5.4 -wsgiref==0.1.2 +libarchive-c==2.9 +lxml==4.6.3 +psycopg2-binary==2.8.4 six==1.10.0 diff --git a/row_processor.py b/row_processor.py index 7aeb09c..fec0814 100644 --- a/row_processor.py +++ b/row_processor.py @@ -2,6 +2,7 @@ from itertools import islice, chain import six + # Efficient parsing of large XML files from # http://stackoverflow.com/a/9814580/987185 def parse(fp): @@ -9,28 +10,31 @@ def parse(fp): returns a generator which yields one row at a time. """ - context = etree.iterparse(fp, events=('end',)) + context = etree.iterparse(fp, events=("end",)) for action, elem in context: - if elem.tag=='row': + if elem.tag == "row": # processing goes here assert elem.text is None, "The row wasn't empty" yield elem.attrib # cleanup # first empty children from current element - # This is not absolutely necessary if you are also deleting - # siblings, but it will allow you to free memory earlier. + # This is not absolutely necessary if you are also deleting + # siblings, but it will allow you to free memory earlier. 
elem.clear() # second, delete previous siblings (records) while elem.getprevious() is not None: del elem.getparent()[0] # make sure you have no references to Element objects outside the loop + def batch(iterable, size): """Creates a batches of size `size` from the `iterable`.""" sourceiter = iter(iterable) while True: batchiter = islice(sourceiter, size) - yield chain([six.next(batchiter)], batchiter) - + try: + yield chain([six.next(batchiter)], batchiter) + except StopIteration: + return diff --git a/sql/Badges_fk.sql b/sql/Badges_fk.sql new file mode 100644 index 0000000..b5a4e3f --- /dev/null +++ b/sql/Badges_fk.sql @@ -0,0 +1 @@ +ALTER TABLE badges ADD CONSTRAINT fk_badges_userid FOREIGN KEY (userid) REFERENCES users (id); diff --git a/sql/Badges_pre.sql b/sql/Badges_pre.sql index 98a2b34..65944d9 100644 --- a/sql/Badges_pre.sql +++ b/sql/Badges_pre.sql @@ -3,5 +3,6 @@ CREATE TABLE Badges ( Id int PRIMARY KEY , UserId int not NULL , Name text not NULL , - Date timestamp not NULL + Date timestamp not NULL , + jsonfield jsonb ); diff --git a/sql/Comments_fk.sql b/sql/Comments_fk.sql new file mode 100644 index 0000000..aea00c9 --- /dev/null +++ b/sql/Comments_fk.sql @@ -0,0 +1,2 @@ +ALTER TABLE Comments ADD CONSTRAINT fk_comments_userid FOREIGN KEY (userid) REFERENCES users (id); +ALTER TABLE Comments ADD CONSTRAINT fk_comments_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/Comments_post.sql b/sql/Comments_post.sql index e19e8b8..2c3e7a2 100644 --- a/sql/Comments_post.sql +++ b/sql/Comments_post.sql @@ -6,4 +6,4 @@ CREATE INDEX cmnts_postid_idx ON Comments USING hash (PostId) CREATE INDEX cmnts_creation_date_idx ON Comments USING btree (CreationDate) WITH (FILLFACTOR = 100); CREATE INDEX cmnts_userid_idx ON Comments USING btree (UserId) - WITH (FILLFACTOR = 100); \ No newline at end of file + WITH (FILLFACTOR = 100); diff --git a/sql/Comments_pre.sql b/sql/Comments_pre.sql index 6942df6..43f166c 100644 --- a/sql/Comments_pre.sql +++ b/sql/Comments_pre.sql @@ -5,5 +5,6 @@ CREATE TABLE Comments ( Score int not NULL , Text text , CreationDate timestamp not NULL , - UserId int + UserId int , + jsonfield jsonb ); diff --git a/sql/PostHistory_fk.sql b/sql/PostHistory_fk.sql new file mode 100644 index 0000000..91379eb --- /dev/null +++ b/sql/PostHistory_fk.sql @@ -0,0 +1,2 @@ +ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_userid FOREIGN KEY (userid) REFERENCES users (id); +ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/PostHistory_pre.sql b/sql/PostHistory_pre.sql index 24684d1..361dd3d 100644 --- a/sql/PostHistory_pre.sql +++ b/sql/PostHistory_pre.sql @@ -6,5 +6,6 @@ CREATE TABLE PostHistory ( RevisionGUID text , CreationDate timestamp not NULL , UserId int , - PostText text + PostText text , + jsonfield jsonb ); diff --git a/sql/PostLinks_fk.sql b/sql/PostLinks_fk.sql new file mode 100644 index 0000000..5c40cb4 --- /dev/null +++ b/sql/PostLinks_fk.sql @@ -0,0 +1,13 @@ +-- impossible to enforce these constraints, set as 'not valid' to disable +-- initial test. 
+-- +-- These constraints can be enforced by running the following queries: +-- ALTER TABLE postlinks ALTER postid DROP NOT NULL; +-- UPDATE postlinks SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_postid; +-- ALTER TABLE postlinks ALTER relatedpostid DROP NOT NULL; +-- UPDATE postlinks SET relatedpostid=NULL WHERE relatedpostid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_relatedpostid; +-- +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID; +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_relatedpostid FOREIGN KEY (relatedpostid) REFERENCES posts (id) NOT VALID; diff --git a/sql/PostLinks_pre.sql b/sql/PostLinks_pre.sql index aaa258c..3793522 100644 --- a/sql/PostLinks_pre.sql +++ b/sql/PostLinks_pre.sql @@ -4,5 +4,6 @@ CREATE TABLE PostLinks ( CreationDate timestamp not NUll , PostId int not NULL , RelatedPostId int not NULL , - LinkTypeId int not Null + LinkTypeId int not Null , + jsonfield jsonb ); diff --git a/sql/Posts_fk.sql b/sql/Posts_fk.sql new file mode 100644 index 0000000..65fea37 --- /dev/null +++ b/sql/Posts_fk.sql @@ -0,0 +1,3 @@ +ALTER TABLE Posts ADD CONSTRAINT fk_posts_parentid FOREIGN KEY (parentid) REFERENCES posts (id); +ALTER TABLE Posts ADD CONSTRAINT fk_posts_owneruserid FOREIGN KEY (owneruserid) REFERENCES users (id); +ALTER TABLE Posts ADD CONSTRAINT fk_posts_lasteditoruserid FOREIGN KEY (lasteditoruserid) REFERENCES users (id); diff --git a/sql/Posts_pre.sql b/sql/Posts_pre.sql index 60f7239..ed4d75e 100644 --- a/sql/Posts_pre.sql +++ b/sql/Posts_pre.sql @@ -19,6 +19,7 @@ CREATE TABLE Posts ( CommentCount int , FavoriteCount int , ClosedDate timestamp , - CommunityOwnedDate timestamp + CommunityOwnedDate timestamp , + jsonfield jsonb ); diff --git a/sql/Tags_fk.sql b/sql/Tags_fk.sql new file mode 100644 index 0000000..ca4ca40 --- /dev/null +++ b/sql/Tags_fk.sql @@ -0,0 +1,2 @@ +-- dummy query +SELECT 1; diff --git a/sql/Tags_pre.sql b/sql/Tags_pre.sql index 26979fe..24dd050 100644 --- a/sql/Tags_pre.sql +++ b/sql/Tags_pre.sql @@ -2,7 +2,8 @@ DROP TABLE IF EXISTS Tags CASCADE; CREATE TABLE Tags ( Id int PRIMARY KEY , TagName text not NULL , - Count int, - ExcerptPostId int, - WikiPostId int + Count int , + ExcerptPostId int , + WikiPostId int , + jsonfield jsonb ); diff --git a/sql/Users_fk.sql b/sql/Users_fk.sql new file mode 100644 index 0000000..ca4ca40 --- /dev/null +++ b/sql/Users_fk.sql @@ -0,0 +1,2 @@ +-- dummy query +SELECT 1; diff --git a/sql/Users_pre.sql b/sql/Users_pre.sql index 4246be3..ad188cf 100644 --- a/sql/Users_pre.sql +++ b/sql/Users_pre.sql @@ -13,6 +13,7 @@ CREATE TABLE Users ( DownVotes int not NULL , ProfileImageUrl text , Age int , - AccountId int -- NULL accountId == deleted account? + AccountId int , -- NULL accountId == deleted account? + jsonfield jsonb ); diff --git a/sql/Votes_fk.sql b/sql/Votes_fk.sql new file mode 100644 index 0000000..a52a2a1 --- /dev/null +++ b/sql/Votes_fk.sql @@ -0,0 +1,10 @@ +ALTER TABLE Votes ADD CONSTRAINT fk_votes_userid FOREIGN KEY (userid) REFERENCES users (id); +-- impossible to enforce this constraint, set as 'not valid' to disable +-- initial test.
+-- +-- This constraint can be enforced by running the following queries: +-- ALTER TABLE votes ALTER PostId DROP NOT NULL; +-- UPDATE votes SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE votes VALIDATE CONSTRAINT fk_votes_postid; +-- +ALTER TABLE Votes ADD CONSTRAINT fk_votes_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID; diff --git a/sql/Votes_pre.sql b/sql/Votes_pre.sql index 2a9b5ff..3ed0b53 100644 --- a/sql/Votes_pre.sql +++ b/sql/Votes_pre.sql @@ -1,10 +1,11 @@ DROP TABLE IF EXISTS Votes CASCADE; CREATE TABLE Votes ( Id int PRIMARY KEY , - PostId int not NULL , + PostId int , -- not NULL , VoteTypeId int not NULL , UserId int , CreationDate timestamp not NULL , + BountyAmount int , + jsonfield jsonb - BountyAmount int );
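Below the patch, a minimal usage sketch of the schema-related options described in the updated README. It assumes a hypothetical schema name `beer`; the `-s`, `-d`, `-n` and `--foreign-keys` switches are the ones added by this patch, and `\i` is used so that `SET search_path` applies to `final_post.sql` within the same psql session:

``` console
$ createdb beerSO
$ python load_into_pg.py -s beer -d beerSO -n beer --foreign-keys
$ psql beerSO
beerSO=# SET search_path TO beer;
beerSO=# \i sql/final_post.sql
```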
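For the `jsonfield` column added to every table in this patch, a quick illustrative query (not part of the patch) that assumes the `Users` table was loaded with the new `-j`/`--insert-json` switch; the JSON keys are the raw XML attribute names from the dump:

``` console
beerSO=# SELECT id, jsonfield->>'DisplayName' AS display_name
beerSO-#   FROM users
beerSO-#  WHERE jsonfield ? 'DisplayName'
beerSO-#  LIMIT 5;
```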