-
Notifications
You must be signed in to change notification settings - Fork 2
Eng 376 Benchmark Concept queries #483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
maparent
wants to merge
11
commits into
main
Choose a base branch
from
eng-376-run-simulation-on-db-schema
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
9572e7d
WIP: send files without automation
maparent 109c32d
wip
maparent 69742b8
name queries
maparent e6cc447
add content for nodes
maparent 82434d9
small bug on generation
maparent e3bc968
consequence of correction
maparent 848928a
fetch schemas first
maparent 2bbf23f
use meaningful local_ids for fake data
maparent 4a126f5
another side effect
maparent 5d657af
coderabbit comments
maparent e53c99a
linting
maparent File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,299 @@ | ||
#!/usr/bin/env python3 | ||
import sys | ||
from collections import defaultdict | ||
from datetime import datetime | ||
from json import dumps | ||
from random import randint | ||
from subprocess import run | ||
|
||
from urllib3.util.url import Url, parse_url | ||
from yaml import safe_load | ||
|
||
|
||
def as_bool(value):
    """Interpret *value* as a boolean, accepting common truthy strings."""
    if isinstance(value, bool):
        return value
    truthy = {"true", "yes", "on", "1", "checked"}
    return str(value).lower() in truthy
|
||
|
||
def psql_command(url: Url, command, cmdfile=None, debug=False, sudo=False, db=None):
    """Run a SQL command (or SQL file) with psql against the database at *url*.

    Builds a psql invocation appropriate for the available credentials:
    password in the URL -> full connection string; ``sudo`` -> run psql as
    that OS user (peer auth); otherwise ``psql -U <user>``.

    Returns psql's stdout (CSV, tuples-only, quiet).
    Raises AssertionError when the URL lacks credentials, when psql exits
    non-zero, or when its stderr contains "ERROR".
    """
    if debug:
        print(command)
    # url.auth is None when the URL carries no credentials; check it first so
    # we fail with the intended message rather than a TypeError on `in None`.
    assert url.auth and ":" in url.auth, "Please provide the password in the postgres URL."
    user, password = url.auth.split(":", 1)
    host = url.hostname
    # url.port is None when the URL omits it; fall back to the postgres
    # default so we never build a "...:None/..." connection string or -p None.
    port = url.port or 5432
    db = db or url.path.strip("/")

    # sudo implies peer authentication; a password would be contradictory.
    assert not (sudo and password)
    if password:
        conn = ["psql", f"postgresql://{user}:{password}@{host}:{port}/{db}"]
    else:
        if sudo:
            conn = ["sudo", "-u", user, "psql"]
        else:
            conn = ["psql", "-U", user]
        conn.append(db)
        if not sudo and host != "localhost":
            conn.extend(["-h", host])
    # Quiet, CSV output, tuples only, no readline: machine-parsable results.
    conn.extend(["-q", "--csv", "-t", "-n"])
    if port != 5432:
        conn.extend(["-p", str(port)])
    if command:
        conn.extend(["-c", command])
    if cmdfile:
        conn.extend(["-f", cmdfile])
    r = run(conn, capture_output=True, encoding="utf-8")
    if debug:
        print(r.returncode, r.stdout, r.stderr)
    assert not r.returncode
    assert "ERROR" not in r.stderr, r.stderr
    return r.stdout
|
||
|
||
def init_database(url, schemas):
    """Reset the target database: recreate it from *schemas*, or just clear data."""
    db_name = url.path.strip("/")
    if not schemas:
        # No schema files given: keep the database, wipe the benchmark tables.
        # Truncating Concept cascades to its dependents.
        for table in ("Concept", "PlatformAccount", "Space"):
            psql_command(url, f'truncate "{table}" CASCADE')
        return
    # This is intended for a local postgres, not for a local supabase.
    psql_command(url, f"drop database if exists {db_name};", db="postgres")
    psql_command(url, f"create database {db_name};", db="postgres")
    for schema_file in schemas:
        psql_command(url, None, schema_file)
|
||
|
||
def generate_space(url):
    """Insert the single test Space row and return its new id."""
    sql = (
        'insert into public."Space" (url, name, platform) '
        "values ('test', 'test', 'Roam') RETURNING id; "
    )
    result = psql_command(url, sql)
    print("Space:", result)
    return int(result)
|
||
|
||
def generate_accounts(url, num_accounts, space_id):
    """Create *num_accounts* fake platform accounts in the given space.

    Returns a dict mapping account id -> account record (id included).
    """
    specs = [
        dict(account_local_id=f"account_{n}", name=f"account_{n}")
        for n in range(num_accounts)
    ]
    result = psql_command(
        url,
        f"select upsert_accounts_in_space({space_id}, $json${dumps(specs)}$json$)",
        debug=False,
    )
    ids = [int(token) for token in result.split()]
    accounts = {
        account_id: spec | dict(id=account_id)
        for (account_id, spec) in zip(ids, specs)
    }
    print("Accounts:", ", ".join(str(a) for a in accounts.keys()))
    return accounts
|
||
|
||
def generate_content(
    url, space_id, accounts, target_num: int, names=None, prefix="content"
):
    """Insert *target_num* content rows, each with an inline document.

    Local ids come from *names* when given, otherwise "<prefix>_<i>".
    Authors are drawn at random from *accounts*. Inserts happen in batches
    of 500. Returns the inserted records, each annotated with its new id.
    """
    account_ids = list(accounts.keys())
    num_accounts = len(account_ids)
    now = datetime.now().isoformat()
    if names:
        inames = iter(names)
    else:
        inames = (f"{prefix}_{i}" for i in range(target_num))

    def make_content():
        # One content row plus its inline document, sharing author and id.
        author = account_ids[randint(0, num_accounts - 1)]
        local_id = next(inames)
        doc = dict(
            source_local_id=local_id,
            created=now,
            last_modified=now,
            author_id=author,
        )
        return dict(
            text=local_id,
            source_local_id=local_id,
            created=now,
            last_modified=now,
            space_id=space_id,
            author_id=author,
            document_inline=doc,
        )

    all_content = []
    for start in range(0, target_num, 500):
        batch = [make_content() for _ in range(start, min(start + 500, target_num))]
        result = psql_command(
            url,
            f"select upsert_content({space_id}, $json${dumps(batch)}$json$, null);",
        )
        # upsert_content returns one id per row, in insertion order.
        for i, token in enumerate(result.split()):
            batch[i]["id"] = int(token)
        all_content.extend(batch)
    print("Content:", ", ".join(str(c["id"]) for c in all_content))
    return all_content
|
||
|
||
def generate_concept_schemata(url, space_id, accounts, node_specs, relation_specs):
    """Create one schema concept per node/relation spec, backed by content rows.

    Returns a pair of dicts: (node schemata by name, relation schemata by name).
    """
    now = datetime.now().isoformat()
    all_specs = node_specs + relation_specs
    content_list = generate_content(
        url, space_id, accounts, len(all_specs), names=[s["name"] for s in all_specs]
    )
    content_iter = iter(content_list)

    def make_concept_schema(name, content, is_relation):
        # Relation schemata declare their role names in literal_content.
        literal = dict(roles=["source", "target"]) if is_relation else dict()
        return dict(
            name=name,
            created=now,
            last_modified=now,
            space_id=space_id,
            author_id=content["author_id"],
            represented_by_id=content["id"],
            is_schema=True,
            literal_content=literal,
        )

    node_schemas = [
        make_concept_schema(spec["name"], next(content_iter), False)
        for spec in node_specs
    ]
    relation_schemas = [
        make_concept_schema(spec["name"], next(content_iter), True)
        for spec in relation_specs
    ]
    schemata = node_schemas + relation_schemas
    result = psql_command(
        url,
        f"select upsert_concepts({space_id}, $json${dumps(schemata)}$json$);",
    )
    nums = result.split()
    for i, schema in enumerate(schemata):
        schema["id"] = int(nums[i])
    print("Schema nodes", ", ".join(f"{s['name']}: {s['id']}" for s in schemata))
    return (
        {s["name"]: s for s in node_schemas},
        {s["name"]: s for s in relation_schemas},
    )
|
||
|
||
def generate_concept_nodes(url, space_id, accounts, node_schemas, node_specs):
    """Create concept nodes for every node spec, inserting in batches of 500."""
    now = datetime.now().isoformat()

    def make_node(name, content, schema_id):
        return dict(
            name=name,
            created=now,
            last_modified=now,
            space_id=space_id,
            author_id=content["author_id"],
            represented_by_id=content["id"],
            schema_id=schema_id,
        )

    all_nodes = []
    for spec in node_specs:
        total = spec["count"]
        schema_id = node_schemas[spec["name"]]["id"]
        # One backing content row per node, named after the schema.
        content_iter = iter(
            generate_content(url, space_id, accounts, total, prefix=spec["name"])
        )
        for start in range(0, total, 500):
            stop = min(start + 500, total)
            batch = [
                make_node(f"{spec['name']}_{i}", next(content_iter), schema_id)
                for i in range(start, stop)
            ]
            result = psql_command(
                url, f"select upsert_concepts({space_id}, $json${dumps(batch)}$json$);"
            )
            nums = result.split()
            for i, node in enumerate(batch):
                node["id"] = int(nums[i])
            all_nodes.extend(batch)
    print("Nodes:", ", ".join(str(n["id"]) for n in all_nodes))
    return all_nodes
|
||
|
||
def generate_relations(
    url, space_id, accounts, node_schemas, reln_schemas, nodes, relation_specs
):
    """Create relation concepts whose roles point at random nodes of the allowed type(s)."""
    account_ids = list(accounts.keys())
    num_accounts = len(account_ids)
    now = datetime.now().isoformat()

    def random_account():
        return account_ids[randint(0, num_accounts - 1)]

    # Index node ids by their schema (type) name so roles can be filled fast.
    node_ids_by_type = defaultdict(list)
    schema_name_by_id = {s["id"]: s["name"] for s in node_schemas.values()}
    for node in nodes:
        node_ids_by_type[schema_name_by_id[node["schema_id"]]].append(node["id"])

    def random_node(schema_name):
        # A role may allow several node types; pick one type first.
        if isinstance(schema_name, list):
            schema_name = schema_name[randint(0, len(schema_name) - 1)]
        candidates = node_ids_by_type[schema_name]
        return candidates[randint(0, len(candidates) - 1)]

    def make_relation(spec, schema_id, i):
        return dict(
            name=f"{spec['name']}_{i}",
            created=now,
            last_modified=now,
            space_id=space_id,
            author_id=random_account(),
            reference_content={
                role: random_node(types) for (role, types) in spec["roles"].items()
            },
            schema_id=schema_id,
        )

    all_relns = []
    for spec in relation_specs:
        total = spec["count"]
        schema_id = reln_schemas[spec["name"]]["id"]
        for start in range(0, total, 500):
            batch = [
                make_relation(spec, schema_id, i)
                for i in range(start, min(start + 500, total))
            ]
            result = psql_command(
                url,
                f"select upsert_concepts({space_id}, $json${dumps(batch)}$json$);",
            )
            nums = result.split()
            for i, reln in enumerate(batch):
                reln["id"] = int(nums[i])
            all_relns.extend(batch)
    print("Relations:", ", ".join(str(r["id"]) for r in all_relns))
    return all_relns
|
||
|
||
def main(fname):
    """Load benchmark parameters from the YAML file *fname* and populate the DB.

    Pipeline: reset database -> space -> accounts -> schemata -> nodes -> relations.
    """
    with open(fname) as f:
        params = safe_load(f)
    url: Url = parse_url(params["database_url"])
    init_database(url, params.get("schemas", []))
    space_id = generate_space(url)
    accounts = generate_accounts(url, params["accounts"]["count"], space_id)
    node_schemas, relation_schemas = generate_concept_schemata(
        url, space_id, accounts, params["nodes"], params["relations"]
    )
    nodes = generate_concept_nodes(
        url, space_id, accounts, node_schemas, params["nodes"]
    )
    generate_relations(
        url,
        space_id,
        accounts,
        node_schemas,
        relation_schemas,
        nodes,
        params["relations"],
    )
|
||
|
||
if __name__ == "__main__":
    # The last CLI argument is the YAML parameter file.
    # NOTE(review): argv[-1] falls back to the script path itself when no
    # argument is given — presumably a filename is always passed; verify.
    fname = sys.argv[-1]
    main(fname)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Benchmark parameters consumed by the concept-query data generator:
# database connection, account count, and node/relation schemata with
# per-type row counts.
schemas:
  # A mechanism to try out different schemas, eventually.
  # - ./benchmark_schemas/prefix.sql
  # - ./supabase/schemas/base.sql
  # - ./supabase/schemas/space.sql
  # - ./supabase/schemas/account.sql
  # - ./supabase/schemas/content.sql
  # - ./supabase/schemas/embedding.sql
  # - ./supabase/schemas/concept.sql
  # - ./supabase/schemas/contributor.sql
  # - ./supabase/schemas/sync.sql
  # - ./supabase/schemas/access_token.sql
# NOTE(review): the host portion below looks like it was mangled by GitHub's
# email redaction ("[email protected]"); confirm the real host:port against the
# original file (likely a local supabase postgres).
database_url: postgresql://postgres:[email protected]:54322/postgres
accounts:
  count: 100
nodes:
  - name: claim
    count: 50000
  - name: hypothesis
    count: 50000
relations:
  # Each role maps to the node type (or list of types) it may point at.
  - name: supports
    roles:
      source: claim
      target: [claim, hypothesis]
    count: 50000
  - name: opposes
    roles:
      source: claim
      target: [claim, hypothesis]
    count: 50000
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.