From 1e833bd19c0118b5f3478595d878a93e564b6a57 Mon Sep 17 00:00:00 2001 From: Yamen Ajjour Date: Wed, 27 Dec 2017 14:08:51 +0100 Subject: [PATCH 1/3] automatic dump generation --- load_into_pg.py | 250 +++++++++++++++++++++++------------------------- 1 file changed, 122 insertions(+), 128 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index c65e854..1992625 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -1,10 +1,12 @@ #!/usr/bin/env python +#!/bin/sh import sys import time import argparse import psycopg2 as pg import row_processor as Processor import six +import subprocess # Special rules needed for certain tables (esp. for old database dumps) specialRules = { @@ -115,23 +117,118 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas -############################################################# -parser = argparse.ArgumentParser() -parser.add_argument( 'table' - , help = 'The table to work on.' - , choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments'] - ) +def get_keys(table): + if table == 'Users': + keys = [ + 'Id' + , 'Reputation' + , 'CreationDate' + , 'DisplayName' + , 'LastAccessDate' + , 'WebsiteUrl' + , 'Location' + , 'AboutMe' + , 'Views' + , 'UpVotes' + , 'DownVotes' + , 'ProfileImageUrl' + , 'Age' + , 'AccountId' + ] + elif table == 'Badges': + keys = [ + 'Id' + , 'UserId' + , 'Name' + , 'Date' + ] + elif table == 'PostLinks': + keys = [ + 'Id' + , 'CreationDate' + , 'PostId' + , 'RelatedPostId' + , 'LinkTypeId' + ] + elif table == 'Comments': + keys = [ + 'Id' + , 'PostId' + , 'Score' + , 'Text' + , 'CreationDate' + , 'UserId' + ] + elif table == 'Votes': + keys = [ + 'Id' + , 'PostId' + , 'VoteTypeId' + , 'UserId' + , 'CreationDate' + , 'BountyAmount' + ] + elif table == 'Posts': + keys = [ + 'Id' + , 'PostTypeId' + , 'AcceptedAnswerId' + , 'ParentId' + , 'CreationDate' + , 'Score' + , 'ViewCount' + , 'Body' + , 'OwnerUserId' + , 'LastEditorUserId' + , 'LastEditorDisplayName' + , 'LastEditDate' + , 'LastActivityDate' + , 'Title' + , 'Tags' + , 'AnswerCount' + , 'CommentCount' + , 'FavoriteCount' + , 'ClosedDate' + , 'CommunityOwnedDate' + ] + + # If the user has not explicitly asked for loading the body, we replace it with NULL + if not args.with_post_body: + specialRules[('Posts', 'Body')] = 'NULL' + + elif table == 'Tags': + keys = [ + 'Id' + , 'TagName' + , 'Count' + , 'ExcerptPostId' + , 'WikiPostId' + ] + elif table == 'PostHistory': + keys = [ + 'Id', + 'PostHistoryTypeId', + 'PostId', + 'RevisionGUID', + 'CreationDate', + 'UserId', + 'Text' + ] + elif table == 'Comments': + keys = [ + 'Id', + 'PostId', + 'Score', + 'Text', + 'CreationDate', + 'UserId', + ] + return keys -parser.add_argument( '-d', '--dbname' - , help = 'Name of database to create the table in. The database must exist.' - , default = 'stackoverflow' - ) +############################################################# -parser.add_argument( '-f', '--file' - , help = 'Name of the file to extract data from.' - , default = None - ) +parser = argparse.ArgumentParser() parser.add_argument( '-u', '--username' , help = 'Username for the database.' @@ -161,114 +258,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas args = parser.parse_args() -table = args.table -keys = None - -if table == 'Users': - keys = [ - 'Id' - , 'Reputation' - , 'CreationDate' - , 'DisplayName' - , 'LastAccessDate' - , 'WebsiteUrl' - , 'Location' - , 'AboutMe' - , 'Views' - , 'UpVotes' - , 'DownVotes' - , 'ProfileImageUrl' - , 'Age' - , 'AccountId' - ] -elif table == 'Badges': - keys = [ - 'Id' - , 'UserId' - , 'Name' - , 'Date' - ] -elif table == 'PostLinks': - keys = [ - 'Id' - , 'CreationDate' - , 'PostId' - , 'RelatedPostId' - , 'LinkTypeId' - ] -elif table == 'Comments': - keys = [ - 'Id' - , 'PostId' - , 'Score' - , 'Text' - , 'CreationDate' - , 'UserId' - ] -elif table == 'Votes': - keys = [ - 'Id' - , 'PostId' - , 'VoteTypeId' - , 'UserId' - , 'CreationDate' - , 'BountyAmount' - ] -elif table == 'Posts': - keys = [ - 'Id' - , 'PostTypeId' - , 'AcceptedAnswerId' - , 'ParentId' - , 'CreationDate' - , 'Score' - , 'ViewCount' - , 'Body' - , 'OwnerUserId' - , 'LastEditorUserId' - , 'LastEditorDisplayName' - , 'LastEditDate' - , 'LastActivityDate' - , 'Title' - , 'Tags' - , 'AnswerCount' - , 'CommentCount' - , 'FavoriteCount' - , 'ClosedDate' - , 'CommunityOwnedDate' - ] - - # If the user has not explicitly asked for loading the body, we replace it with NULL - if not args.with_post_body: - specialRules[('Posts', 'Body')] = 'NULL' - -elif table == 'Tags': - keys = [ - 'Id' - , 'TagName' - , 'Count' - , 'ExcerptPostId' - , 'WikiPostId' - ] -elif table == 'PostHistory': - keys = [ - 'Id', - 'PostHistoryTypeId', - 'PostId', - 'RevisionGUID', - 'CreationDate', - 'UserId', - 'Text' - ] -elif table == 'Comments': - keys = [ - 'Id', - 'PostId', - 'Score', - 'Text', - 'CreationDate', - 'UserId', - ] + try: # Python 2/3 compatibility @@ -276,10 +266,14 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas except NameError: pass -choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table)) - -if len(choice) > 0 and choice[0].lower() == 'y': - handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password) -else: - six.print_("Cancelled.") +database_name= "emacs" +subprocess.call("./create_db.sh %s"%database_name,shell=True) +dump_folder="/home/befi8957/tmp/data/emacs.stackexchange.com/" +tables = ['Comments','PostHistory','Tags','Posts','Votes','Comments','PostLinks','Badges'] +for table in tables: + + file = dump_folder+table+".xml" + keys = get_keys(table) + handleTable(table, keys, database_name, file, args.host, args.port, args.username, args.password) + From 4b98bd267fda073fe36d50dcb42ac41182be53d3 Mon Sep 17 00:00:00 2001 From: Yamen Ajjour Date: Wed, 27 Dec 2017 19:44:19 +0100 Subject: [PATCH 2/3] adding create-db --- create_db.sh | 1 + 1 file changed, 1 insertion(+) create mode 100755 create_db.sh diff --git a/create_db.sh b/create_db.sh new file mode 100755 index 0000000..90f886c --- /dev/null +++ b/create_db.sh @@ -0,0 +1 @@ +psql postgres -c "create database $1" \ No newline at end of file From ef7bd7ce6ac6322905f60c01548fe780b1fa4255 Mon Sep 17 00:00:00 2001 From: Webis Date: Thu, 28 Dec 2017 22:36:42 +0100 Subject: [PATCH 3/3] adding stachexchange-sites --- get_site_names.py | 5 + load_into_pg.py | 2 +- stackexchange-sites.csv | 350 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 get_site_names.py create mode 100644 stackexchange-sites.csv diff --git a/get_site_names.py b/get_site_names.py new file mode 100644 index 0000000..9054983 --- /dev/null +++ b/get_site_names.py @@ -0,0 +1,5 @@ +import os + +dump_dir = "/media/webis20/corpora/corpora-thirdparty/corpora-stackexcahnge/http/" +for subdir, dirs, files in os.walk(dump_dir): + print subdir.replace(dump_dir,"") diff --git a/load_into_pg.py b/load_into_pg.py index 1992625..1a1a68b 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -268,7 +268,7 @@ def get_keys(table): database_name= "emacs" subprocess.call("./create_db.sh %s"%database_name,shell=True) -dump_folder="/home/befi8957/tmp/data/emacs.stackexchange.com/" +dump_folder="/media/webis20/corpora/corpora-thirdparty/corpora-stackexcahnge/http/english.stackexchange.com/" tables = ['Comments','PostHistory','Tags','Posts','Votes','Comments','PostLinks','Badges'] for table in tables: diff --git a/stackexchange-sites.csv b/stackexchange-sites.csv new file mode 100644 index 0000000..36d28b6 --- /dev/null +++ b/stackexchange-sites.csv @@ -0,0 +1,350 @@ + +poker.stackexchange.com +latin.meta.stackexchange.com +gis.meta.stackexchange.com +puzzling.stackexchange.com +windowsphone.stackexchange.com +worldbuilding.meta.stackexchange.com +gardening.meta.stackexchange.com +mathematica.stackexchange.com +scicomp.meta.stackexchange.com +ai.stackexchange.com +expressionengine.stackexchange.com +elementaryos.meta.stackexchange.com +economics.stackexchange.com +gaming.stackexchange.com +startups.meta.stackexchange.com +reverseengineering.stackexchange.com +magento.stackexchange.com +quant.meta.stackexchange.com +cs.meta.stackexchange.com +aviation.stackexchange.com +politics.meta.stackexchange.com +history.meta.stackexchange.com +meta.serverfault.com +webmasters.meta.stackexchange.com +craftcms.stackexchange.com +buddhism.stackexchange.com +musicfans.meta.stackexchange.com +vegetarianism.stackexchange.com +ja.stackoverflow.com +es.stackoverflow.com +money.stackexchange.com +stackoverflow.com-PostLinks +startups.stackexchange.com +parenting.stackexchange.com +webapps.stackexchange.com +english.stackexchange.com +pets.stackexchange.com +hinduism.stackexchange.com +esperanto.meta.stackexchange.com +academia.meta.stackexchange.com +mechanics.meta.stackexchange.com +astronomy.stackexchange.com +academia.stackexchange.com +matheducators.meta.stackexchange.com +meta.superuser.com +worldbuilding.stackexchange.com +tor.stackexchange.com +chess.stackexchange.com +workplace.stackexchange.com +earthscience.stackexchange.com +stackoverflow.com-Votes +devops.stackexchange.com +raspberrypi.meta.stackexchange.com +civicrm.stackexchange.com +astronomy.meta.stackexchange.com +meta.askubuntu.com +sound.stackexchange.com +cstheory.meta.stackexchange.com +patents.meta.stackexchange.com +islam.stackexchange.com +meta.pt.stackoverflow.com +ell.stackexchange.com +sqa.stackexchange.com +security.stackexchange.com +lifehacks.meta.stackexchange.com +retrocomputing.meta.stackexchange.com +devops.meta.stackexchange.com +meta.ru.stackoverflow.com +cs.stackexchange.com +portuguese.meta.stackexchange.com +health.stackexchange.com +tridion.meta.stackexchange.com +gis.stackexchange.com +anime.meta.stackexchange.com +christianity.meta.stackexchange.com +diy.meta.stackexchange.com +ethereum.stackexchange.com +mechanics.stackexchange.com +outdoors.stackexchange.com +sound.meta.stackexchange.com +diy.stackexchange.com +iot.meta.stackexchange.com +cseducators.stackexchange.com +tridion.stackexchange.com +tor.meta.stackexchange.com +ham.meta.stackexchange.com +movies.stackexchange.com +robotics.meta.stackexchange.com +hardwarerecs.stackexchange.com +literature.stackexchange.com +italian.meta.stackexchange.com +christianity.stackexchange.com +hsm.stackexchange.com +meta.mathoverflow.net +emacs.stackexchange.com +biology.meta.stackexchange.com +blender.stackexchange.com +meta.stackoverflow.com +chemistry.meta.stackexchange.com +italian.stackexchange.com +hermeneutics.meta.stackexchange.com +boardgames.meta.stackexchange.com +security.meta.stackexchange.com +russian.meta.stackexchange.com +health.meta.stackexchange.com +matheducators.stackexchange.com +cooking.meta.stackexchange.com +bicycles.meta.stackexchange.com +parenting.meta.stackexchange.com +fitness.meta.stackexchange.com +spanish.meta.stackexchange.com +sustainability.meta.stackexchange.com +rpg.stackexchange.com +ux.stackexchange.com +interpersonal.meta.stackexchange.com +pm.meta.stackexchange.com +bioinformatics.meta.stackexchange.com +computergraphics.meta.stackexchange.com +gaming.meta.stackexchange.com +stackapps.com +avp.stackexchange.com +scifi.meta.stackexchange.com +money.meta.stackexchange.com +judaism.meta.stackexchange.com +unix.meta.stackexchange.com +codereview.meta.stackexchange.com +salesforce.meta.stackexchange.com +avp.meta.stackexchange.com +bioinformatics.stackexchange.com +freelancing.stackexchange.com +codereview.stackexchange.com +gamedev.stackexchange.com +productivity.stackexchange.com +webapps.meta.stackexchange.com +travel.meta.stackexchange.com +french.meta.stackexchange.com +opendata.stackexchange.com +buddhism.meta.stackexchange.com +retrocomputing.stackexchange.com +bicycles.stackexchange.com +unix.stackexchange.com +sqa.meta.stackexchange.com +physics.meta.stackexchange.com +networkengineering.meta.stackexchange.com +rpg.meta.stackexchange.com +outdoors.meta.stackexchange.com +computergraphics.stackexchange.com +korean.stackexchange.com +politics.stackexchange.com +lifehacks.stackexchange.com +vi.meta.stackexchange.com +spanish.stackexchange.com +martialarts.meta.stackexchange.com +apple.stackexchange.com +mythology.meta.stackexchange.com +skeptics.stackexchange.com +arduino.meta.stackexchange.com +quant.stackexchange.com +movies.meta.stackexchange.com +ai.meta.stackexchange.com +moderators.stackexchange.com +softwareengineering.stackexchange.com +expressionengine.meta.stackexchange.com +reverseengineering.meta.stackexchange.com +latin.stackexchange.com +crypto.stackexchange.com +graphicdesign.meta.stackexchange.com +arduino.stackexchange.com +physics.stackexchange.com +german.stackexchange.com +boardgames.stackexchange.com +moderators.meta.stackexchange.com +cseducators.meta.stackexchange.com +coffee.stackexchange.com +opensource.stackexchange.com +space.meta.stackexchange.com +emacs.meta.stackexchange.com +ebooks.meta.stackexchange.com +coffee.meta.stackexchange.com +math.stackexchange.com +ebooks.stackexchange.com +magento.meta.stackexchange.com +softwarerecs.meta.stackexchange.com +softwarerecs.stackexchange.com +music.meta.stackexchange.com +photo.stackexchange.com +cogsci.meta.stackexchange.com +vi.stackexchange.com +{arabic} +anime.stackexchange.com +travel.stackexchange.com +meta.ukrainian.stackexchange.com +woodworking.stackexchange.com +networkengineering.stackexchange.com +hsm.meta.stackexchange.com +iot.stackexchange.com +arabic.meta.stackexchange.com +chinese.stackexchange.com +meta.ja.stackoverflow.com +music.stackexchange.com +meta.es.stackoverflow.com +philosophy.stackexchange.com +raspberrypi.stackexchange.com +stackoverflow.com-Tags +korean.meta.stackexchange.com +german.meta.stackexchange.com +datascience.stackexchange.com +windowsphone.meta.stackexchange.com +space.stackexchange.com +hermeneutics.stackexchange.com +homebrew.meta.stackexchange.com +rus.meta.stackexchange.com +electronics.stackexchange.com +drupal.stackexchange.com +stackoverflow.com-PostHistory +pt.stackoverflow.com +linguistics.stackexchange.com +apple.meta.stackexchange.com +law.meta.stackexchange.com +blender.meta.stackexchange.com +robotics.stackexchange.com +woodworking.meta.stackexchange.com +crypto.meta.stackexchange.com +cstheory.stackexchange.com +engineering.stackexchange.com +languagelearning.meta.stackexchange.com +earthscience.meta.stackexchange.com +graphicdesign.stackexchange.com +scicomp.stackexchange.com +rus.stackexchange.com +ux.meta.stackexchange.com +dba.meta.stackexchange.com +stats.stackexchange.com +augur.meta.stackexchange.com +sports.meta.stackexchange.com +crafts.stackexchange.com +esperanto.stackexchange.com +freelancing.meta.stackexchange.com +biology.stackexchange.com +beer.stackexchange.com +monero.meta.stackexchange.com +android.meta.stackexchange.com +ru.stackoverflow.com +chess.meta.stackexchange.com +android.stackexchange.com +musicfans.stackexchange.com +writers.meta.stackexchange.com +stackoverflow.com-Posts +softwareengineering.meta.stackexchange.com +genealogy.stackexchange.com +skeptics.meta.stackexchange.com +japanese.stackexchange.com +drupal.meta.stackexchange.com +philosophy.meta.stackexchange.com +joomla.stackexchange.com +3dprinting.meta.stackexchange.com +poker.meta.stackexchange.com +ukrainian.stackexchange.com +meta.stackexchange.com +tex.stackexchange.com +homebrew.stackexchange.com +bricks.stackexchange.com +electronics.meta.stackexchange.com +mathoverflow.net +ethereum.meta.stackexchange.com +expatriates.meta.stackexchange.com +cogsci.stackexchange.com +datascience.meta.stackexchange.com +codegolf.meta.stackexchange.com +chemistry.stackexchange.com +martialarts.stackexchange.com +3dprinting.stackexchange.com +aviation.meta.stackexchange.com +serverfault.com +superuser.com +joomla.meta.stackexchange.com +linguistics.meta.stackexchange.com +expatriates.stackexchange.com +literature.meta.stackexchange.com +patents.stackexchange.com +stats.meta.stackexchange.com +chinese.meta.stackexchange.com +craftcms.meta.stackexchange.com +writers.stackexchange.com +english.meta.stackexchange.com +japanese.meta.stackexchange.com +sustainability.stackexchange.com +economics.meta.stackexchange.com +sports.stackexchange.com +salesforce.stackexchange.com +bitcoin.meta.stackexchange.com +codegolf.stackexchange.com +puzzling.meta.stackexchange.com +augur.stackexchange.com +engineering.meta.stackexchange.com +sitecore.stackexchange.com +ham.stackexchange.com +crafts.meta.stackexchange.com +civicrm.meta.stackexchange.com +mythology.stackexchange.com +dsp.stackexchange.com +genealogy.meta.stackexchange.com +portuguese.stackexchange.com +french.stackexchange.com +hinduism.meta.stackexchange.com +mathematica.meta.stackexchange.com +scifi.stackexchange.com +ell.meta.stackexchange.com +languagelearning.stackexchange.com +sitecore.meta.stackexchange.com +history.stackexchange.com +judaism.stackexchange.com +beer.meta.stackexchange.com +math.meta.stackexchange.com +dsp.meta.stackexchange.com +cooking.stackexchange.com +opendata.meta.stackexchange.com +gamedev.meta.stackexchange.com +meta.vegetarianism.stackexchange.com +pm.stackexchange.com +opensource.meta.stackexchange.com +wordpress.stackexchange.com +stackoverflow.com-Users +pets.meta.stackexchange.com +workplace.meta.stackexchange.com +law.stackexchange.com +hardwarerecs.meta.stackexchange.com +stackoverflow.com-Badges +monero.stackexchange.com +arabic +sharepoint.meta.stackexchange.com +dba.stackexchange.com +photo.meta.stackexchange.com +interpersonal.stackexchange.com +webmasters.stackexchange.com +gardening.stackexchange.com +tex.meta.stackexchange.com +productivity.meta.stackexchange.com +askubuntu.com +bitcoin.stackexchange.com +stackoverflow.com-Comments +russian.stackexchange.com +bricks.meta.stackexchange.com +arabic.stackexchange.com +elementaryos.stackexchange.com +wordpress.meta.stackexchange.com +fitness.stackexchange.com +islam.meta.stackexchange.com +sharepoint.stackexchange.com