From c4bc2bb670091e876496298369db94ff596b2a1f Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 23 Jun 2021 16:46:59 -0700 Subject: [PATCH 1/8] defaulting to elastic in quickstart --- docker/quickstart.sh | 16 ++++++++++++- metadata-ingestion/src/datahub/cli/docker.py | 25 +++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/docker/quickstart.sh b/docker/quickstart.sh index f22cdc2d2b0933..a02300b1faaef3 100755 --- a/docker/quickstart.sh +++ b/docker/quickstart.sh @@ -15,4 +15,18 @@ DEFAULT_VERSION=$(echo $TAG_VERSION | sed 's/undefined/head/') export DATAHUB_VERSION=${DATAHUB_VERSION:-${DEFAULT_VERSION}} echo "Quickstarting DataHub: version ${DATAHUB_VERSION}" -cd $DIR && docker-compose pull && docker-compose -p datahub up +if docker volume ls | grep -c -q datahub_neo4jdata +then + echo "Datahub Neo4j volume found, starting with neo4j as graph service" + cd $DIR && docker-compose pull && docker-compose -p datahub up + cd $DIR && docker-compose pull && docker-compose -p datahub up +else + echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service" + cd $DIR && \ + docker-compose \ + -f quickstart/docker-compose-without-neo4j.quickstart.yml \ + pull && \ + docker-compose -p datahub \ + -f quickstart/docker-compose-without-neo4j.quickstart.yml \ + up +fi diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index b248ed565b3ffa..fb8829e48fb092 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -17,11 +17,13 @@ ) from datahub.ingestion.run.pipeline import Pipeline -SIMPLE_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml" +NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml" +ELASTIC_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose-without-neo4j.quickstart.yml" BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json" GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master" -GITHUB_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{SIMPLE_QUICKSTART_COMPOSE_FILE}" +GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}" +GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{ELASTIC_QUICKSTART_COMPOSE_FILE}" GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}" @@ -58,6 +60,20 @@ def check() -> None: docker_check_impl() +def check_neo4j_volume_exists(): + ps = subprocess.run(['docker', 'volume', 'ls'], check=True, capture_output=True) + output = subprocess.run(('grep', '-c', 'datahub_neo4jdata'), input=ps.stdout, capture_output=True) + + results = int(output.stdout.decode("utf-8").split('\n')[0]) + + if results > 0: + click.echo("Datahub Neo4j volume found, starting with neo4j as graph service") + return True + + click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service") + return False + + @docker.command() @click.option( "--version", @@ -110,12 +126,15 @@ def quickstart( ) # convert to list from tuple if not quickstart_compose_file: click.echo("Fetching docker-compose file from GitHub") + neo4j_volume_exists = check_neo4j_volume_exists() with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp_file: path = pathlib.Path(tmp_file.name) quickstart_compose_file.append(path) # Download the quickstart docker-compose file from GitHub. - quickstart_download_response = requests.get(GITHUB_QUICKSTART_COMPOSE_URL) + quickstart_download_response = requests.get( + GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL if check_neo4j_volume_exists else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL + ) quickstart_download_response.raise_for_status() tmp_file.write(quickstart_download_response.content) From 9f8110cb5beaebb772022319b677529267d3f0de Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 23 Jun 2021 17:59:02 -0700 Subject: [PATCH 2/8] adding elastic option to quickstart --- metadata-ingestion/src/datahub/cli/docker.py | 25 +++++++++++-------- .../src/datahub/cli/docker_check.py | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index fb8829e48fb092..5a2c2f3b49b3d7 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -61,17 +61,21 @@ def check() -> None: def check_neo4j_volume_exists(): - ps = subprocess.run(['docker', 'volume', 'ls'], check=True, capture_output=True) - output = subprocess.run(('grep', '-c', 'datahub_neo4jdata'), input=ps.stdout, capture_output=True) - - results = int(output.stdout.decode("utf-8").split('\n')[0]) + with get_client_with_error() as (client, error): + if error: + click.secho( + "Docker doesn't seem to be running. Did you start it?", fg="red" + ) + return - if results > 0: - click.echo("Datahub Neo4j volume found, starting with neo4j as graph service") - return True + if len(client.volumes.list(filters={"name": "datahub_neo4jdata"})) > 0: + click.echo("Datahub Neo4j volume found, starting with neo4j as graph service." + "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.") + return True - click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service") - return False + click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service." + "To use neo4j as a graph backend, ") + return False @docker.command() @@ -126,14 +130,13 @@ def quickstart( ) # convert to list from tuple if not quickstart_compose_file: click.echo("Fetching docker-compose file from GitHub") - neo4j_volume_exists = check_neo4j_volume_exists() with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp_file: path = pathlib.Path(tmp_file.name) quickstart_compose_file.append(path) # Download the quickstart docker-compose file from GitHub. quickstart_download_response = requests.get( - GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL if check_neo4j_volume_exists else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL + GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL if check_neo4j_volume_exists() else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL ) quickstart_download_response.raise_for_status() tmp_file.write(quickstart_download_response.content) diff --git a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index ee45b0feedf092..f67f47a8643f71 100644 --- a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -12,7 +12,6 @@ "schema-registry", "broker", "mysql", - "neo4j", "zookeeper", # These two containers are not necessary - only helpful in debugging. # "kafka-topics-ui", @@ -33,6 +32,7 @@ # We only add this container in some cases, but if it's present, we # definitely want to check that it exits properly. "mysql-setup", + "neo4j", ] # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it. From 5bc0a410f493fb5029e23014ccd495f679821a68 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 23 Jun 2021 18:20:37 -0700 Subject: [PATCH 3/8] updating instructions --- metadata-ingestion/src/datahub/cli/docker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index 5a2c2f3b49b3d7..5a693d129663c5 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -74,7 +74,8 @@ def check_neo4j_volume_exists(): return True click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service." - "To use neo4j as a graph backend, ") + "To use neo4j as a graph backend, run with" + "`--quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`") return False From 1cd359cabc1b831498a9412fe14466e51f631578 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 23 Jun 2021 20:40:58 -0700 Subject: [PATCH 4/8] improving logging --- metadata-ingestion/src/datahub/cli/docker.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index 5a693d129663c5..cf1a1f638f9725 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -69,13 +69,14 @@ def check_neo4j_volume_exists(): return if len(client.volumes.list(filters={"name": "datahub_neo4jdata"})) > 0: - click.echo("Datahub Neo4j volume found, starting with neo4j as graph service." - "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.") + click.echo("Datahub Neo4j volume found, starting with neo4j as graph service.\n" + "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.\n") return True - click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service." - "To use neo4j as a graph backend, run with" - "`--quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`") + click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service.\n" + "To use neo4j as a graph backend, run \n" + "`datahub docker quickstart --quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`" + "\nfrom the root of the datahub repo\n") return False From a8410c6b1860195a80a60b0b083f27f75791d127 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 23 Jun 2021 21:10:47 -0700 Subject: [PATCH 5/8] removing unneeded line --- docker/quickstart.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/quickstart.sh b/docker/quickstart.sh index a02300b1faaef3..6d95c1147a505b 100755 --- a/docker/quickstart.sh +++ b/docker/quickstart.sh @@ -19,7 +19,6 @@ if docker volume ls | grep -c -q datahub_neo4jdata then echo "Datahub Neo4j volume found, starting with neo4j as graph service" cd $DIR && docker-compose pull && docker-compose -p datahub up - cd $DIR && docker-compose pull && docker-compose -p datahub up else echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service" cd $DIR && \ From f5ba852fa9b539ab1dca6e1977d71ccc6db67503 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 23 Jun 2021 21:35:17 -0700 Subject: [PATCH 6/8] lint issues --- metadata-ingestion/src/datahub/cli/docker.py | 36 ++++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index cf1a1f638f9725..37cf6386cbef4f 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -17,13 +17,21 @@ ) from datahub.ingestion.run.pipeline import Pipeline -NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml" -ELASTIC_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose-without-neo4j.quickstart.yml" +NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE = ( + "docker/quickstart/docker-compose.quickstart.yml" +) +ELASTIC_QUICKSTART_COMPOSE_FILE = ( + "docker/quickstart/docker-compose-without-neo4j.quickstart.yml" +) BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json" GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master" -GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}" -GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{ELASTIC_QUICKSTART_COMPOSE_FILE}" +GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = ( + f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}" +) +GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL = ( + f"{GITHUB_BASE_URL}/{ELASTIC_QUICKSTART_COMPOSE_FILE}" +) GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}" @@ -69,14 +77,18 @@ def check_neo4j_volume_exists(): return if len(client.volumes.list(filters={"name": "datahub_neo4jdata"})) > 0: - click.echo("Datahub Neo4j volume found, starting with neo4j as graph service.\n" - "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.\n") + click.echo( + "Datahub Neo4j volume found, starting with neo4j as graph service.\n" + "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.\n" + ) return True - click.echo("No Datahub Neo4j volume found, starting with elasticsearch as graph service.\n" - "To use neo4j as a graph backend, run \n" - "`datahub docker quickstart --quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`" - "\nfrom the root of the datahub repo\n") + click.echo( + "No Datahub Neo4j volume found, starting with elasticsearch as graph service.\n" + "To use neo4j as a graph backend, run \n" + "`datahub docker quickstart --quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`" + "\nfrom the root of the datahub repo\n" + ) return False @@ -138,7 +150,9 @@ def quickstart( # Download the quickstart docker-compose file from GitHub. quickstart_download_response = requests.get( - GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL if check_neo4j_volume_exists() else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL + GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL + if check_neo4j_volume_exists() + else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL ) quickstart_download_response.raise_for_status() tmp_file.write(quickstart_download_response.content) From c2cd984f4cac15ef77acb408b2d9d5051f763c67 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Thu, 24 Jun 2021 12:47:36 -0700 Subject: [PATCH 7/8] updating docs --- datahub-kubernetes/README.md | 9 +++++---- docker/README.md | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/datahub-kubernetes/README.md b/datahub-kubernetes/README.md index 79bd71ed540988..8e7750fb0e2991 100644 --- a/datahub-kubernetes/README.md +++ b/datahub-kubernetes/README.md @@ -34,12 +34,15 @@ The main components are powered by 4 external dependencies: - Kafka - Local DB (MySQL, Postgres, MariaDB) - Search Index (Elasticsearch) -- Graph Index (Supports only Neo4j) +- Graph Index (Supports either Neo4j or Elasticsearch) The dependencies must be deployed before deploying Datahub. We created a separate [chart](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes/prerequisites) for deploying the dependencies with example configuration. They could also be deployed -separately on-prem or leveraged as managed services. +separately on-prem or leveraged as managed services. To remove your dependency on Neo4j, +remove Neo4j from the prerequisites `Chart.yaml` and `values.yaml` files. +Then, override the `graph_service_impl` field in `datahub-kubernetes/datahub/values.yaml` to +have the value `elasticsearch`. ## Quickstart Assuming kubectl context points to the correct kubernetes cluster, first create kubernetes secrets that contain MySQL and Neo4j passwords. @@ -130,5 +133,3 @@ to expose the 9002 port to the public. | helm uninstall datahub | Remove DataHub | | helm ls | List of Helm charts | | helm history | Fetch a release history | - - diff --git a/docker/README.md b/docker/README.md index 5bcb5ca4310ab1..c4908514d3d587 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,7 +32,7 @@ Dependencies: * [Kafka, Zookeeper, and Schema Registry](kafka-setup) * [Elasticsearch](elasticsearch-setup) * [MySQL](mysql) -* [Neo4j](neo4j) +* [(Optional) Neo4j](neo4j) ### Ingesting demo data. From 79a99ef17e6fc949a72d75cd160f29c389542c2c Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Thu, 24 Jun 2021 15:17:44 -0700 Subject: [PATCH 8/8] updating helm instructions --- datahub-kubernetes/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datahub-kubernetes/README.md b/datahub-kubernetes/README.md index 8e7750fb0e2991..9283c6e6e6780d 100644 --- a/datahub-kubernetes/README.md +++ b/datahub-kubernetes/README.md @@ -40,9 +40,9 @@ The dependencies must be deployed before deploying Datahub. We created a separat [chart](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes/prerequisites) for deploying the dependencies with example configuration. They could also be deployed separately on-prem or leveraged as managed services. To remove your dependency on Neo4j, -remove Neo4j from the prerequisites `Chart.yaml` and `values.yaml` files. +set enabled to false in the `datahub-kubernetes/prerequisites/values.yaml` file. Then, override the `graph_service_impl` field in `datahub-kubernetes/datahub/values.yaml` to -have the value `elasticsearch`. +have the value `elasticsearch` instead of `neo4j`. ## Quickstart Assuming kubectl context points to the correct kubernetes cluster, first create kubernetes secrets that contain MySQL and Neo4j passwords.