diff --git a/datahub-kubernetes/README.md b/datahub-kubernetes/README.md index 79bd71ed540988..9283c6e6e6780d 100644 --- a/datahub-kubernetes/README.md +++ b/datahub-kubernetes/README.md @@ -34,12 +34,15 @@ The main components are powered by 4 external dependencies: - Kafka - Local DB (MySQL, Postgres, MariaDB) - Search Index (Elasticsearch) -- Graph Index (Supports only Neo4j) +- Graph Index (Supports either Neo4j or Elasticsearch) The dependencies must be deployed before deploying Datahub. We created a separate [chart](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes/prerequisites) for deploying the dependencies with example configuration. They could also be deployed -separately on-prem or leveraged as managed services. +separately on-prem or leveraged as managed services. To remove your dependency on Neo4j, +set `enabled` to `false` for Neo4j in the `datahub-kubernetes/prerequisites/values.yaml` file. +Then, override the `graph_service_impl` field in `datahub-kubernetes/datahub/values.yaml` to +have the value `elasticsearch` instead of `neo4j`. ## Quickstart Assuming kubectl context points to the correct kubernetes cluster, first create kubernetes secrets that contain MySQL and Neo4j passwords. @@ -130,5 +133,3 @@ to expose the 9002 port to the public. | helm uninstall datahub | Remove DataHub | | helm ls | List of Helm charts | | helm history | Fetch a release history | - - diff --git a/docker/README.md b/docker/README.md index 5bcb5ca4310ab1..c4908514d3d587 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,7 +32,7 @@ Dependencies: * [Kafka, Zookeeper, and Schema Registry](kafka-setup) * [Elasticsearch](elasticsearch-setup) * [MySQL](mysql) -* [Neo4j](neo4j) +* [(Optional) Neo4j](neo4j) ### Ingesting demo data.
diff --git a/docker/quickstart.sh b/docker/quickstart.sh index f22cdc2d2b0933..6d95c1147a505b 100755 --- a/docker/quickstart.sh +++ b/docker/quickstart.sh @@ -15,4 +15,18 @@ DEFAULT_VERSION=$(echo $TAG_VERSION | sed 's/undefined/head/') export DATAHUB_VERSION=${DATAHUB_VERSION:-${DEFAULT_VERSION}} echo "Quickstarting DataHub: version ${DATAHUB_VERSION}" -cd $DIR && docker-compose pull && docker-compose -p datahub up +# Pick the graph backend: use Neo4j only if its data volume already exists, otherwise Elasticsearch. +if docker volume ls | grep -q datahub_neo4jdata +then + echo "Datahub Neo4j volume found, starting with neo4j as graph service" + cd "$DIR" && docker-compose pull && docker-compose -p datahub up +else + echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service" + cd "$DIR" && \ + docker-compose \ + -f quickstart/docker-compose-without-neo4j.quickstart.yml \ + pull && \ + docker-compose -p datahub \ + -f quickstart/docker-compose-without-neo4j.quickstart.yml \ + up +fi diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index b248ed565b3ffa..37cf6386cbef4f 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -17,11 +17,21 @@ ) from datahub.ingestion.run.pipeline import Pipeline -SIMPLE_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml" +NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE = ( + "docker/quickstart/docker-compose.quickstart.yml" +) +ELASTIC_QUICKSTART_COMPOSE_FILE = ( + "docker/quickstart/docker-compose-without-neo4j.quickstart.yml" +) BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json" GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master" -GITHUB_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{SIMPLE_QUICKSTART_COMPOSE_FILE}" +GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = ( + f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}" +) +GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL = ( +
f"{GITHUB_BASE_URL}/{ELASTIC_QUICKSTART_COMPOSE_FILE}" +) GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}" @@ -58,6 +68,31 @@ def check() -> None: docker_check_impl() +def check_neo4j_volume_exists() -> bool: + """Return True if a `datahub_neo4jdata` Docker volume exists, else False.""" + with get_client_with_error() as (client, error): + if error: + click.secho( + "Docker doesn't seem to be running. Did you start it?", fg="red" + ) + return False  # explicit bool; falsy exactly like the former bare return + + if len(client.volumes.list(filters={"name": "datahub_neo4jdata"})) > 0: + click.echo( + "Datahub Neo4j volume found, starting with neo4j as graph service.\n" + "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.\n" + ) + return True + + click.echo( + "No Datahub Neo4j volume found, starting with elasticsearch as graph service.\n" + "To use neo4j as a graph backend, run \n" + "`datahub docker quickstart --quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`" + "\nfrom the root of the datahub repo\n" + ) + return False + + @docker.command() @click.option( "--version", @@ -115,7 +150,11 @@ def quickstart( quickstart_compose_file.append(path) # Download the quickstart docker-compose file from GitHub. - quickstart_download_response = requests.get(GITHUB_QUICKSTART_COMPOSE_URL) + quickstart_download_response = requests.get( + GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL + if check_neo4j_volume_exists() + else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL + ) quickstart_download_response.raise_for_status() tmp_file.write(quickstart_download_response.content) diff --git a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index ee45b0feedf092..f67f47a8643f71 100644 --- a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -12,7 +12,6 @@ "schema-registry", "broker", "mysql", - "neo4j", "zookeeper", # These two containers are not necessary - only helpful in debugging.
# "kafka-topics-ui", @@ -33,6 +32,7 @@ # We only add this container in some cases, but if it's present, we # definitely want to check that it exits properly. "mysql-setup", + "neo4j", ] # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.