diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 00000000..1b2a9700
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,13 @@
+#
+# Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+# file except in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+# Add the label to any change within the folder or any of its subfolders
+xinfra-monitor:
+  - src/main/java/com/linkedin/xinfra/monitor/**/*
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..8b410884
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,27 @@
+name: CI
+
+on:
+  pull_request:
+    types: ['opened', 'synchronize']
+
+jobs:
+  ci:
+    name: ci
+    strategy:
+      matrix:
+        version: ['11.0.13']
+        dist: ['microsoft']
+
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: set up JDK ${{matrix.version}} (${{matrix.dist}})
+        uses: actions/setup-java@v3
+        with:
+          java-version: ${{ matrix.version }}
+          distribution: ${{ matrix.dist }}
+      - name: test
+        run: ./gradlew --info test --no-daemon
diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml
new file mode 100644
index 00000000..aa455d62
--- /dev/null
+++ b/.github/workflows/greetings.yml
@@ -0,0 +1,17 @@
+name: Greetings
+
+on:
+  pull_request:
+    types: ['opened']
+  issues:
+    types: ['opened']
+
+jobs:
+  greeting:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/first-interaction@v1
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          issue-message: 'This is your first issue in the repository. Thank you for raising this issue.'
+          pr-message: 'This is your first pull request in the repository. Thank you for this patch. Please review the Wiki page in the repository before submitting a PR.'
diff --git a/.github/workflows/label.yml b/.github/workflows/label.yml
new file mode 100644
index 00000000..96317667
--- /dev/null
+++ b/.github/workflows/label.yml
@@ -0,0 +1,25 @@
+# This workflow will triage pull requests and apply a label based on the
+# paths that are modified in the pull request.
+#
+# To use this workflow, you will need to set up a .github/labeler.yml
+# file with configuration.
For more information, see: +# https://github.com/actions/labeler/blob/master/README.md + +name: "Pull Request Labeler" +on: + - pull_request + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v3-preview + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + + + + + + + diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 00000000..34045177 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,19 @@ +name: Mark stale issues and pull requests + +on: + schedule: + - cron: "30 1 * * *" + +jobs: + stale: + + runs-on: ubuntu-latest + + steps: + - uses: actions/stale@v1 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: 'Stale issue message' + stale-pr-message: 'Stale pull request message' + stale-issue-label: 'no-issue-activity' + stale-pr-label: 'no-pr-activity' diff --git a/.github/workflows/tag.yml b/.github/workflows/tag.yml new file mode 100644 index 00000000..e5356484 --- /dev/null +++ b/.github/workflows/tag.yml @@ -0,0 +1,35 @@ +name: tag (release) flow + +on: + create: + tags: + - '*' + +jobs: + gradle-java8: + name: Java 8 release + runs-on: ubuntu-latest + steps: + - name: checkout code + uses: actions/checkout@v3 + with: + # bring in all history because the gradle versions plugin needs to "walk back" to the closest ancestor tag + # to figure out what version this is. optimizing this is left as a challenge to future committers + fetch-depth: 0 + - name: Set up JDK + uses: actions/setup-java@v3 + with: + java-version: 11 + distribution: microsoft + - name: Build with Gradle + # add --info or --debug below for more details when trying to understand issues + run: ./gradlew clean build javadoc --stacktrace --warning-mode all --no-daemon + - name: Branch tag + id: branch_tag + run: echo "RELEASE_TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + - name: Publish to Jfrog + env: + JFROG_USER: ${{ secrets.JFROG_USER }} + JFROG_KEY: ${{ secrets.JFROG_KEY }} + RELEASE_TAG: ${{ steps.branch_tag.outputs.RELEASE_TAG }} + run: ./scripts/publishToJfrog.sh diff --git a/.gitignore b/.gitignore index b8ae435e..5b4afd4a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,15 @@ .DS_Store build/ logs/ +.classpath +.idea/ +.project +.settings/ +src/test/java/com/linkedin/xinfra/monitor/RandomTests.java + +config/andrew-choi.properties +config/andrew-multi-cluster-monitor.properties + +kafka-monitor.iml +kafka-monitor.ipr +kafka-monitor.iws diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0d5889e9..00000000 --- a/.travis.yml +++ /dev/null @@ -1,6 +0,0 @@ -language: java - -jdk: - - oraclejdk7 - - openjdk7 - - oraclejdk8 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..118ac0fd --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at andchoi@linkedin.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+## FAQ
+
+For LinkedIn Code of Conduct (OSS Code of Conduct) issues or inquiries, please
+email the Global Compliance & Integrity inbox at integrity@linkedin.com.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..7d1d7f31
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+Contribution Agreement
+======================
+
+As a contributor, you represent that the code you submit is your
+original work or that of your employer (in which case you represent you
+have the right to bind your employer). By submitting code, you (and, if
+applicable, your employer) are licensing the submitted code to LinkedIn
+and the open source community subject to the Apache 2.0 license.
+
+Responsible Disclosure of Security Vulnerabilities
+==================================================
+
+Please do not file reports on GitHub for security issues.
+Please review the guidelines at
+https://www.linkedin.com/help/linkedin/answer/62924/security-vulnerabilities?lang=en
+
+Tips for Getting Your Pull Request Accepted
+===========================================
+
+1. Make sure all new features are tested and the tests pass.
+2. Bug fixes must include a test case demonstrating the error being fixed.
+
+Reporting Issues
+================
+Please use [this link](https://github.com/linkedin/kafka-monitor/issues/new) to report any issues.
diff --git a/LICENSE b/LICENSE
index 02c5bb4d..0d5476d3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,3 @@
-
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
@@ -187,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2016 LinkedIn Corp. All rights reserved.
+   Copyright 2016, 2017, 2018, 2019, 2020, 2021 LinkedIn Corp. All rights reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/NOTICE b/NOTICE
index ee9a4345..d02fd50c 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,4 +1,4 @@
-Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+Copyright 2016, 2017, 2018, 2019, 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 file except in compliance with the License. You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
@@ -34,18 +34,3 @@ License: http://www.json.org/license.html
 This product includes/uses JUnit (http://junit.org/)
 Copyright 2002-2016 JUnit
 License: Eclipse Public License 1.0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/README.md b/README.md
index 7b5ba295..8313ba42 100644
--- a/README.md
+++ b/README.md
@@ -1,95 +1,186 @@
-# Kafka Monitor
+<img src="docs/images/xinfra_monitor.png">
+# Xinfra Monitor [![Build Status](https://travis-ci.org/linkedin/kafka-monitor.svg?branch=master)](https://travis-ci.org/linkedin/kafka-monitor)
+![Greetings](https://github.com/linkedin/kafka-monitor/workflows/Greetings/badge.svg)
+![Mark stale issues and pull requests](https://github.com/linkedin/kafka-monitor/workflows/Mark%20stale%20issues%20and%20pull%20requests/badge.svg)
+![Pull Request Labeler](https://github.com/linkedin/kafka-monitor/workflows/Pull%20Request%20Labeler/badge.svg)
 
-Kafka Monitor is a framework to implement and execute long-running kafka
+Xinfra Monitor (formerly Kafka Monitor) is a framework to implement and execute long-running Kafka
 system tests in a real cluster. It complements Kafka’s existing system
 tests by capturing potential bugs or regressions that are only likely to occur
 after a prolonged period of time or with low probability. Moreover, it allows you to
 monitor a Kafka cluster using end-to-end pipelines to obtain a number of derived vital stats
-such as end-to-end latency, service availability and message loss rate. You can easily
-deploy Kafka Monitor to test and monitor your Kafka cluster without requiring
+such as
+
+1. End-to-end latency
+2. Service availability
+3. Produce and Consume availability
+4. Consumer offset commit availability
+5. Consumer offset commit latency
+6. Kafka message loss rate
+7. And many, many more.
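Each of these stats is exposed as a JMX metric (the full object names appear in the list further down). With the bundled Jolokia service enabled, they can also be read over plain HTTP. A minimal sketch, reusing the Jolokia query that the pre-rename README documented (8778 is Jolokia's default port; substitute any object name/attribute pair from the metric list):

```
# Read the current average produce availability from a running instance
curl localhost:8778/jolokia/read/kmf.services:type=produce-service,name=*/produce-availability-avg
```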
+
+You can easily
+deploy Xinfra Monitor to test and monitor your Kafka cluster without requiring
 any change to your application.
 
-Kafka Monitor can automatically create the monitor topic with the specified config
+Xinfra Monitor can automatically create the monitor topic with the specified config
 and increase partition count of the monitor topic to ensure partition# >=
 broker#. It can also reassign partitions and trigger preferred leader election
 to ensure that each broker acts as leader of at least one partition of the
-monitor topic. This allows Kafka Monitor to detect performance issue on every
+monitor topic. This allows Xinfra Monitor to detect performance issues on every
 broker without requiring users to manually manage the partition assignment of
 the monitor topic.
 
+Xinfra Monitor is used in conjunction with different middle-layer services such as
+li-apache-kafka-clients in order to monitor single clusters, pipeline destination
+clusters, and other types of clusters, as done in LinkedIn engineering for
+real-time cluster health checks.
+
+These are some of the metrics emitted from a Xinfra Monitor instance.
+
+```
+kmf:type=kafka-monitor:offline-runnable-count
+kmf.services:type=produce-service,name=*:produce-availability-avg
+kmf.services:type=consume-service,name=*:consume-availability-avg
+kmf.services:type=produce-service,name=*:records-produced-total
+kmf.services:type=consume-service,name=*:records-consumed-total
+kmf.services:type=produce-service,name=*:records-produced-rate
+kmf.services:type=produce-service,name=*:produce-error-rate
+kmf.services:type=consume-service,name=*:consume-error-rate
+kmf.services:type=consume-service,name=*:records-lost-total
+kmf.services:type=consume-service,name=*:records-lost-rate
+kmf.services:type=consume-service,name=*:records-duplicated-total
+kmf.services:type=consume-service,name=*:records-delay-ms-avg
+kmf.services:type=commit-availability-service,name=*:offsets-committed-avg
+kmf.services:type=commit-availability-service,name=*:offsets-committed-total
+kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg
+kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total
+kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg
+kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max
+kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th
+kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th
+kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th
+```
+
 ## Getting Started
 
 ### Prerequisites
 
-Kafka Monitor requires Gradle 2.0 or higher. Java 7 should be used for
+Xinfra Monitor requires Gradle 2.0 or higher. Java 7 should be used for
 building in order to support both Java 7 and Java 8 at runtime.
 
-Kafka Monitor supports Apache Kafka 0.8 and 0.9. Use branch 0.8.2.2 to monitor Apache
-Kafka cluster 0.8. Use branch 0.9.0.1 to compile with Kafka 0.9. Use master
-branch to compile with Kafka 0.10.
+Xinfra Monitor supports Apache Kafka 0.8 to 2.0: +- Use branch 0.8.2.2 to work with Apache Kafka 0.8 +- Use branch 0.9.0.1 to work with Apache Kafka 0.9 +- Use branch 0.10.2.1 to work with Apache Kafka 0.10 +- Use branch 0.11.x to work with Apache Kafka 0.11 +- Use branch 1.0.x to work with Apache Kafka 1.0 +- Use branch 1.1.x to work with Apache Kafka 1.1 +- Use master branch to work with Apache Kafka 2.0 + ### Configuration Tips -- We advise advanced users to run Kafka Monitor with -`./bin/kafka-monitor-start.sh config/kafka-monitor.properties`. The default -kafka-monitor.properties in the repo provides an simple example of how to +
+1. We advise advanced users to run Xinfra Monitor with
+`./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties`. The default
+xinfra-monitor.properties in the repo provides a simple example of how to
 monitor a single cluster. You probably need to change the value of
-`zookeeper.connect` and `bootstrap.servers` to point to your cluster.
-
-- The full list of configs and their documentation can be found in the code of
+`zookeeper.connect` and `bootstrap.servers` to point to your cluster.
+
+2. The full list of configs and their documentation can be found in the code of
 the Config class for the respective service, e.g. ProduceServiceConfig.java and
-ConsumeServiceConfig.java.
-
-- You can specify multiple SingleClusterMonitor in the kafka-monitor.properties to
-monitor multiple Kafka clusters in one Kafka Monitor process. As another
-advanced use-cse, you can point ProduceService and ConsumeService to two
-different Kafka clusters that are connected by MirrorMaker to monitor their
-end-to-end latency.
-
-- Kafka Monitor by default will automatically create the monitor topic based on
-the e.g. `topic-management.replicationFactor` and `topic-management.partitionsToBrokersRatio`
+ConsumeServiceConfig.java.
+
+3. You can specify multiple SingleClusterMonitor in the xinfra-monitor.properties to
+monitor multiple Kafka clusters in one Xinfra Monitor process. As another
+advanced use-case, you can point ProduceService and ConsumeService to two
+different Kafka clusters that are connected by MirrorMaker to monitor their
+end-to-end latency.
+
+4. Xinfra Monitor by default will automatically create the monitor topic based on
+configs such as `topic-management.replicationFactor` and `topic-management.partitionsToBrokersRatio`
 specified in the config. replicationFactor is 1 by default and you probably want
 to change it to the same replication factor as used for your existing
-topics. You can disable auto topic creation by setting `produce.topic.topicCreationEnabled` to false.
-
-- Kafka Monitor can automatically increase partition count of the monitor topic
+topics. You can disable auto topic creation by setting `produce.topic.topicCreationEnabled` to false.
+
+5. Xinfra Monitor can automatically increase the partition count of the monitor topic
 to ensure partition# >= broker#. It can also reassign partitions and trigger
 preferred leader election to ensure that each broker acts as leader of at
 least one partition of the monitor topic. To use this feature, use either
-EndToEndTest or TopicManagementService in the properties file.
-
-
-### Build Kafka Monitor
+EndToEndTest or TopicManagementService in the properties file.
+
+6. When using Secure Sockets Layer (SSL) or any non-plaintext security protocol for AdminClient,
+please configure the following entries in the single-cluster-monitor props,
+`produce.producer.props`, as well as `consume.consumer.props` (a minimal sketch follows this list):
+https://docs.confluent.io/current/installation/configuration/admin-configs.html
+   - `ssl.key.password`
+   - `ssl.keystore.location`
+   - `ssl.keystore.password`
+   - `ssl.truststore.location`
+   - `ssl.truststore.password`
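As a minimal sketch of the SSL tip above, the entries sit next to the other client properties in `config/xinfra-monitor.properties`; the keystore/truststore paths and passwords here are placeholders, and the same keys would be repeated under `consume.consumer.props` and the top-level single-cluster-monitor props:

```
"produce.producer.props": {
  "client.id": "kmf-client-id",
  "security.protocol": "SSL",
  "ssl.key.password": "<key-password>",
  "ssl.keystore.location": "/path/to/client.keystore.jks",
  "ssl.keystore.password": "<keystore-password>",
  "ssl.truststore.location": "/path/to/client.truststore.jks",
  "ssl.truststore.password": "<truststore-password>"
}
```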
+### Build Xinfra Monitor
 ```
 $ git clone https://github.com/linkedin/kafka-monitor.git
 $ cd kafka-monitor
 $ ./gradlew jar
 ```
 
-### Start KafkaMonitor to run tests/services specified in the config file
+### Start XinfraMonitor to run tests/services specified in the config file
+```
+$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties
+```
+
+### Run Xinfra Monitor with arbitrary producer/consumer configuration (e.g. SASL enabled client)
+Edit `config/xinfra-monitor.properties` to specify custom producer configurations in the
+key/value map `produce.producer.props`. Similarly, specify custom consumer
+configurations in `consume.consumer.props`. The documentation for the producer and
+consumer configs in these key/value maps can be found in the Apache Kafka wiki.
+
 ```
-$ ./bin/kafka-monitor-start.sh config/kafka-monitor.properties
+$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties
 ```
 
 ### Run SingleClusterMonitor app to monitor a Kafka cluster
+
+Metrics `produce-availability-avg` and `consume-availability-avg` demonstrate
+whether messages can be properly produced to and consumed from this cluster.
+See the Service Overview wiki for how these metrics are derived.
+
 ```
 $ ./bin/single-cluster-monitor.sh --topic test --broker-list localhost:9092 --zookeeper localhost:2181
 ```
 
-### Get metric values (e.g. service availability, message loss rate) in real-time as time series graphs
-Open ```localhost:8000/index.html``` in your web browser
+### Run MultiClusterMonitor app to monitor a pipeline of Kafka clusters connected by MirrorMaker
+Edit `config/multi-cluster-monitor.properties` to specify the right broker and
+ZooKeeper URLs, as suggested by the comments in the properties file.
 
-You can edit webapp/index.html to easily add new metrics to be displayed.
+Metrics `produce-availability-avg` and `consume-availability-avg` demonstrate
+whether messages can be properly produced to the source cluster and consumed
+from the destination cluster. See config/multi-cluster-monitor.properties for
+the full JMX path for these metrics.
 
-### Query metric value (e.g. service availability) via HTTP request
 ```
-curl localhost:8778/jolokia/read/kmf.services:type=produce-service,name=*/produce-availability-avg
+$ ./bin/xinfra-monitor-start.sh config/multi-cluster-monitor.properties
 ```
 
-You can query other JMX metric value as well by substituting object-name and
-attribute-name of the JMX metric in the query above.
-
 ### Run checkstyle on the Java code
 ```
 ./gradlew checkstyleMain checkstyleTest
@@ -105,8 +196,6 @@
 - [Motivation](https://github.com/linkedin/kafka-monitor/wiki/Motivation)
 - [Design Overview](https://github.com/linkedin/kafka-monitor/wiki/Design-Overview)
-- [Service Design](https://github.com/linkedin/kafka-monitor/wiki/Service-Design)
+- [Service and App Overview](https://github.com/linkedin/kafka-monitor/wiki)
 - [Future Work](https://github.com/linkedin/kafka-monitor/wiki/Future-Work)
-
-
-
+- [Application Configuration](https://github.com/linkedin/kafka-monitor/wiki/App-Configuration)
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 00000000..f645bac2
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,17 @@
+# Security Policy
+
+## Supported Versions
+
+| Version | Supported          |
+| ------- | ------------------ |
+| 1.1.x   | :white_check_mark: |
+
+## Reporting a Vulnerability
+
+Please report vulnerabilities by opening an issue at https://github.com/linkedin/kafka-monitor/issues/new.
diff --git a/bin/kmf-run-class.sh b/bin/kmf-run-class.sh
index d694ee0b..0fb1f8a2 100755
--- a/bin/kmf-run-class.sh
+++ b/bin/kmf-run-class.sh
@@ -50,7 +50,7 @@ fi
 
 # Log4j settings
 if [ -z "$KAFKA_LOG4J_OPTS" ]; then
-  KAFKA_LOG4J_OPTS="-Dlog4j.configuration=file:$base_dir/config/log4j.properties"
+  KAFKA_LOG4J_OPTS="-Dlog4j.configurationFile=$base_dir/config/log4j2.properties"
 fi
 
 KAFKA_LOG4J_OPTS="-Dkafka.logs.dir=$LOG_DIR $KAFKA_LOG4J_OPTS"
@@ -74,7 +74,7 @@ fi
 
 # JVM performance options
 if [ -z "$KAFKA_JVM_PERFORMANCE_OPTS" ]; then
-  KAFKA_JVM_PERFORMANCE_OPTS="-server -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -XX:+CMSScavengeBeforeRemark -XX:+DisableExplicitGC -Djava.awt.headless=true"
+  KAFKA_JVM_PERFORMANCE_OPTS="-server -Djava.awt.headless=true"
 fi
 
diff --git a/bin/single-cluster-monitor.sh b/bin/single-cluster-monitor.sh
index 7c83636f..79f9eacb 100755
--- a/bin/single-cluster-monitor.sh
+++ b/bin/single-cluster-monitor.sh
@@ -9,4 +9,4 @@
 
 base_dir=$(dirname $0)
 
-exec $base_dir/kmf-run-class.sh com/linkedin/kmf/apps/SingleClusterMonitor $@
+exec $base_dir/kmf-run-class.sh com/linkedin/xinfra/monitor/apps/SingleClusterMonitor $@
diff --git a/bin/windows/kafka-monitor-start.bat b/bin/windows/kafka-monitor-start.bat
index abba5640..45eedad7 100644
--- a/bin/windows/kafka-monitor-start.bat
+++ b/bin/windows/kafka-monitor-start.bat
@@ -15,11 +15,11 @@ popd
 
 IF [%1] EQU [] (
-  echo USAGE: %0 config/kafka-monitor.properties
+  echo USAGE: %0 config/xinfra-monitor.properties
   EXIT /B 1
 )
 
-set COMMAND=%BASE_DIR%\kmf-run-class.bat com.linkedin.kmf.KafkaMonitor %*
+set COMMAND=%BASE_DIR%\kmf-run-class.bat com.linkedin.xinfra.monitor.XinfraMonitor %*
 
 rem echo basedir: %BASE_DIR%
 
diff --git a/bin/windows/kmf-run-class.bat b/bin/windows/kmf-run-class.bat
index 559a965d..caddf261 100644
--- a/bin/windows/kmf-run-class.bat
+++ b/bin/windows/kmf-run-class.bat
@@ -10,12 +10,12 @@ REM an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expre
 
 setlocal enabledelayedexpansion
 
 IF [%1] EQU [] (
-  echo USAGE: %0 com.linkedin.kmf.KafkaMonitor config/kafka-monitor.properties
+  echo USAGE: %0 com.linkedin.xinfra.monitor.XinfraMonitor config/xinfra-monitor.properties
   EXIT /B 1
 )
 
 IF [%2] EQU [] (
-  echo USAGE: %0 %1 config/kafka-monitor.properties
+  echo USAGE: %0 %1 config/xinfra-monitor.properties
   EXIT /B 1
 )
 
@@ -60,7 +60,7 @@ IF ["%LOG_DIR%"] EQU [""] (
 
 rem Log4j settings
 IF ["%KAFKA_LOG4J_OPTS%"] EQU [""] (
-  set
KAFKA_LOG4J_OPTS=-Dlog4j.configuration=file:%BASE_DIR%\config\log4j.properties + set KAFKA_LOG4J_OPTS=-Dlog4j.configurationFile=%BASE_DIR%\config\log4j2.properties ) ELSE ( # create logs directory IF not exist %LOG_DIR% ( @@ -170,7 +170,7 @@ REM fi REM Launch mode REM if [ "x$DAEMON_MODE" = "xtrue" ]; then -REM nohup $JAVA $KAFKA_HEAP_OPTS $KAFKA_JVM_PERFORMANCE_OPTS $KAFKA_GC_LOG_OPTS $KAFKA_JMX_OPTS -cp $CLASSPATH REM $KAFKA_OPTS "$@" > "$CONSOLE_OUTPUT_FILE" 2>&1 < /dev/null &REM -REM elseREM +REM nohup $JAVA $KAFKA_HEAP_OPTS $KAFKA_JVM_PERFORMANCE_OPTS $KAFKA_GC_LOG_OPTS $KAFKA_JMX_OPTS -cp $CLASSPATH REM $KAFKA_OPTS "$@" > "$CONSOLE_OUTPUT_FILE" 2>&1 < /dev/null &REM +REM elseREM REM exec $JAVA $KAFKA_HEAP_OPTS $KAFKA_JVM_PERFORMANCE_OPTS $KAFKA_GC_LOG_OPTS $KAFKA_JMX_OPTS REM $KAFKA_LOG4J_OPTS -cp $CLASSPATH $KAFKA_OPTS "$@" REM fi diff --git a/bin/kafka-monitor-start.sh b/bin/xinfra-monitor-start.sh similarity index 86% rename from bin/kafka-monitor-start.sh rename to bin/xinfra-monitor-start.sh index eb4863dc..2a341a16 100755 --- a/bin/kafka-monitor-start.sh +++ b/bin/xinfra-monitor-start.sh @@ -9,4 +9,4 @@ base_dir=$(dirname $0) -exec $base_dir/kmf-run-class.sh com/linkedin/kmf/KafkaMonitor $@ +exec $base_dir/kmf-run-class.sh com/linkedin/xinfra/monitor/XinfraMonitor $@ diff --git a/build.gradle b/build.gradle index 9db41aca..230cba17 100644 --- a/build.gradle +++ b/build.gradle @@ -1,51 +1,134 @@ + +def configDocDir = "${buildDir}/configDocs" + +apply plugin: 'maven-publish' +apply plugin: 'distribution' + + allprojects { apply plugin: 'idea' apply plugin: 'eclipse' apply plugin: 'java' apply plugin: 'checkstyle' - version = "1.0.0" + sourceCompatibility = 8 + targetCompatibility = 8 - sourceCompatibility = 1.7 + group = 'com.linkedin.kmf' repositories { mavenCentral() + maven { + url "https://linkedin.jfrog.io/artifactory/avro-util/" + } } dependencies { compile 'net.sourceforge.argparse4j:argparse4j:0.5.0' - compile 'org.slf4j:slf4j-log4j12:1.7.6' - compile 'org.apache.avro:avro:1.4.0' - compile 'org.apache.kafka:kafka_2.11:0.10.1.1' - compile 'org.apache.kafka:kafka-clients:0.10.1.1' - compile 'org.testng:testng:6.8.8' - compile 'org.eclipse.jetty:jetty-server:8.1.19.v20160209' + compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.17.1' + compile 'org.apache.avro:avro:1.9.2' compile 'org.json:json:20140107' - compile 'com.fasterxml.jackson.core:jackson-databind:2.7.1' - compile 'org.jolokia:jolokia-jvm:1.3.3' + compile 'org.jolokia:jolokia-jvm:1.6.2' compile 'net.savantly:graphite-client:1.1.0-RELEASE' compile 'com.timgroup:java-statsd-client:3.0.1' - + compile 'com.signalfx.public:signalfx-codahale:0.0.47' + compile group: 'org.apache.kafka', name: 'kafka_2.12', version: '2.8.2' + compile group: 'org.apache.kafka', name: 'kafka-clients', version: '2.8.2' + compile 'org.apache.commons:commons-lang3:3.12.0' + compile 'com.linkedin.avroutil1:helper-all:0.2.118' + compile 'org.apache.zookeeper:zookeeper:3.8.0' + testCompile 'org.mockito:mockito-core:2.24.0' testCompile 'org.testng:testng:6.8.8' } tasks.create(name: "copyDependantLibs", type: Copy) { - from (configurations.testRuntime) { + from(configurations.testRuntime) { include('slf4j-log4j12*') } - from (configurations.runtime) { - } + from(configurations.runtime) {} into "build/dependant-libs" duplicatesStrategy 'exclude' } jar { + doFirst { + manifest { + // embed version information into jar manifests + attributes('Name': "${project.name}", + 'Specification-Title': "${project.name}", + 'Specification-Version': 
"${project.version}", + 'Specification-Vendor': "LinkedIn", + 'Implementation-Title': "${project.name}", + 'Implementation-Version': "${project.version}", + 'Implementation-Vendor': "LinkedIn") + } + } + dependsOn 'copyDependantLibs' } + task sourceJar(type: Jar) { + from sourceSets.main.allJava + classifier "sources" + } + + task javadocJar(type: Jar) { + from javadoc + classifier = 'javadoc' + } + task testJar(type: Jar) { - classifier = 'test' - from sourceSets.test.output + from sourceSets.test.allJava + classifier = 'tests' + } + + publishing { + publications { + MyPublication(MavenPublication) { + groupId project.group + artifactId project.name + version project.version + + from components.java + artifact sourceJar + artifact javadocJar + artifact testJar + artifact distZip + artifact distTar + + pom { + name = 'kafka-monitor' + description = 'kafka monitor' + url = 'https://github.com/linkedin/kafka-monitor' + + licenses { + license { + name = 'The Apache Software License, Version 2.0' + url = 'http://www.apache.org/licenses/LICENSE-2.0.txt' + } + } + scm { + connection = 'scm:git:git://github.com:linkedin/kafka-monitor.git' + developerConnection = 'scm:git:ssh://github.com:linkedin/kafka-monitor.git' + url = 'https://github.com/linkedin/kafka-monitor' + } + } + + repositories { + mavenLocal() + maven { + name "LinkedInJfrog" + url "https://linkedin.jfrog.io/artifactory/kafka-monitor" + credentials { + if (System.getenv('JFROG_USER') != null && System.getenv('JFROG_KEY') != null) { + username System.getenv('JFROG_USER') + password System.getenv('JFROG_KEY') + } + } + } + } + } + } } artifacts { @@ -54,8 +137,24 @@ allprojects { checkstyle { configFile = new File(rootDir, "checkstyle/checkstyle.xml") + configProperties = ["suppressionFile": new File(rootDir, "checkstyle/suppressions.xml")] + } + + task createConfigDocs( dependsOn : compileJava, type : JavaExec) { + outputs.dir configDocDir + classpath sourceSets.main.runtimeClasspath + main = 'com.linkedin.xinfra.monitor.common.ConfigDocumentationGenerator' + args = [configDocDir, + 'com.linkedin.xinfra.monitor.services.configs.ConsumeServiceConfig', + 'com.linkedin.xinfra.monitor.services.configs.DefaultMetricsReporterServiceConfig', + 'com.linkedin.xinfra.monitor.services.configs.JettyServiceConfig', + 'com.linkedin.xinfra.monitor.services.configs.ProduceServiceConfig', + 'com.linkedin.xinfra.monitor.services.configs.TopicManagementServiceConfig', + 'com.linkedin.xinfra.monitor.apps.configs.MultiClusterMonitorConfig'] } + build.dependsOn createConfigDocs + test.dependsOn('checkstyleMain', 'checkstyleTest') test { @@ -66,8 +165,43 @@ allprojects { exceptionFormat = 'full' } } + + distributions { + main { + contents { + into('bin') { + from 'bin' + } + into('build/libs') { + from jar + } + into('build/dependant-libs') { + from copyDependantLibs + } + into('config') { + from 'config' + } + into('build/configDocs') { + from createConfigDocs + } + into('webapp') { + from 'webapp' + } + from('.') { + include 'README.md' + } + } + } + } + tasks.withType(Tar){ + compression = Compression.GZIP + extension = 'tar.gz' + } + } -task wrapper(type: Wrapper) { - gradleVersion = '2.11' +wrapper { + gradleVersion = '5.2.1' + distributionType = Wrapper.DistributionType.ALL + } diff --git a/checkstyle/checkstyle.xml b/checkstyle/checkstyle.xml index f7edb531..da75cff1 100644 --- a/checkstyle/checkstyle.xml +++ b/checkstyle/checkstyle.xml @@ -18,11 +18,14 @@ - + - + + + + @@ -32,6 +35,7 @@ + @@ 
-77,4 +81,6 @@ + + diff --git a/checkstyle/suppressions.xml b/checkstyle/suppressions.xml new file mode 100644 index 00000000..b5062103 --- /dev/null +++ b/checkstyle/suppressions.xml @@ -0,0 +1,9 @@ + + + + + + + \ No newline at end of file diff --git a/config/kafka-monitor.properties b/config/kafka-monitor.properties deleted file mode 100644 index 91779669..00000000 --- a/config/kafka-monitor.properties +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this -# file except in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# This properties file specifies the tests/services that KafkaMonitor -# should instantiate and run, together with the key/value pairs used to -# configure these tests/services. It should have the following format: -# -# { -# "name1" : { -# "type": TestClassName -# "key1": value1, -# "key2": value2, -# ... -# }, -# "name2" : { -# "type": ServiceClassName -# "key1": value1, -# "key2": value2, -# ... -# }, -# ... -# } -# -# TestClassName can be canonical name or simple name of any class that implements -# interface com.linkedin.kmf.services.Test. These classes should be under -# package com.linkedin.kmf.tests. -# -# ServiceClassName can be canonical name or simple name of any class that implements -# interface com.linkedin.kmf.services.Service. These classes should be under -# package com.linkedin.kmf.services. -# -# Each test/service should be configured with class.name which can be either TestClassName -# or ServiceClassName. The key for the test/service in the json map is used as name to -# identify the test/service in the log or JMX metrics, which is useful if multiple -# test/service with the same class.name are run in the same Kafka Monitor process. 
-# - -{ - "single-cluster-monitor": { - "class.name": "com.linkedin.kmf.apps.SingleClusterMonitor", - "topic": "kafka-monitor-topic", - "zookeeper.connect": "localhost:2181", - "bootstrap.servers": "localhost:9092", - "produce.record.delay.ms": 100, - "topic-management.topicCreationEnabled": true, - "topic-management.replicationFactor" : 1, - "topic-management.partitionsToBrokersRatio" : 2.0, - "topic-management.rebalance.interval.ms" : 600000, - "topic-management.topicFactory.props": { - }, - "produce.producer.props": { - "client.id": "kmf-client-id" - }, - - "consume.latency.sla.ms": "20000", - "consume.consumer.props": { - - } - - }, - - "reporter-service": { - "class.name": "com.linkedin.kmf.services.DefaultMetricsReporterService", - "report.interval.sec": 1, - "report.metrics.list": [ - "kmf:type=kafka-monitor:offline-runnable-count", - "kmf.services:type=produce-service,name=*:produce-availability-avg", - "kmf.services:type=consume-service,name=*:consume-availability-avg", - "kmf.services:type=produce-service,name=*:records-produced-total", - "kmf.services:type=consume-service,name=*:records-consumed-total", - "kmf.services:type=consume-service,name=*:records-lost-total", - "kmf.services:type=consume-service,name=*:records-duplicated-total", - "kmf.services:type=consume-service,name=*:records-delay-ms-avg", - "kmf.services:type=produce-service,name=*:records-produced-rate", - "kmf.services:type=produce-service,name=*:produce-error-rate", - "kmf.services:type=consume-service,name=*:consume-error-rate" - ] - }, - - "jetty-service": { - "class.name": "com.linkedin.kmf.services.JettyService", - "jetty.port": 8000 - }, - - "jolokia-service": { - "class.name": "com.linkedin.kmf.services.JolokiaService" - } -} - - diff --git a/config/log4j.properties b/config/log4j.properties deleted file mode 100644 index 8d4b79a9..00000000 --- a/config/log4j.properties +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this -# file except in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -log4j.rootLogger=INFO, stdout - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n - -log4j.appender.kafkaClientAppender=org.apache.log4j.DailyRollingFileAppender -log4j.appender.kafkaClientAppender.DatePattern='.'yyyy-MM-dd-HH -log4j.appender.kafkaClientAppender.File=${kafka.logs.dir}/kafka-client.log -log4j.appender.kafkaClientAppender.layout=org.apache.log4j.PatternLayout -log4j.appender.kafkaClientAppender.layout.ConversionPattern=[%d] %p %m (%c)%n - -log4j.logger.com.linkedin.kmf.core.KafkaMonitor=INFO, stdout -log4j.additivity.com.linkedin.kmf.core.KafkaMonitor=false - -log4j.logger.org.apache.kafka=WARN, kafkaClientAppender -log4j.additivity.org.apache.kafka=false - -log4j.logger.kafka=WARN, kafkaClientAppender -log4j.additivity.kafka=false - diff --git a/config/log4j2.properties b/config/log4j2.properties new file mode 100644 index 00000000..4896697d --- /dev/null +++ b/config/log4j2.properties @@ -0,0 +1,47 @@ +# Copyright 2020 LinkedIn Corp. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+# file except in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+status = error
+dest = err
+name = PropertiesConfig
+
+filter.threshold.type = ThresholdFilter
+filter.threshold.level = debug
+
+appender.console.type = Console
+appender.console.name = STDOUT
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = [%d] %p %m (%c)%n
+
+appender.kafka.type = RollingFile
+appender.kafka.name = KAFKA
+appender.kafka.filename = ${sys:kafka.logs.dir}/kafka-client.log
+appender.kafka.filePattern = ${sys:kafka.logs.dir}/kafka-client.log.%d{yyyy-MM-dd-HH}
+appender.kafka.layout.type = PatternLayout
+appender.kafka.layout.pattern = [%d] %p %m (%c)%n
+appender.kafka.policies.type = Policies
+appender.kafka.policies.time.type = TimeBasedTriggeringPolicy
+
+# Modify this as needed when working on a dev box. Trace -> Debug -> Info -> Warn -> Error -> Fatal
+rootLogger.level = info
+rootLogger.appenderRef.console.ref = STDOUT
+
+logger.kmf.name = com.linkedin.kmf.core.KafkaMonitor
+logger.kmf.level = info
+logger.kmf.additivity = false
+logger.kmf.appenderRef.console.ref = STDOUT
+
+logger.kafkaClient.name = org.apache.kafka
+logger.kafkaClient.level = warn
+logger.kafkaClient.additivity = false
+logger.kafkaClient.appenderRef.kafka.ref = KAFKA
+
+logger.kafka.name = kafka
+logger.kafka.level = warn
+logger.kafka.additivity = false
+logger.kafka.appenderRef.kafka.ref = KAFKA
diff --git a/config/multi-cluster-monitor.properties b/config/multi-cluster-monitor.properties
index 6c35ec19..dd40b035 100644
--- a/config/multi-cluster-monitor.properties
+++ b/config/multi-cluster-monitor.properties
@@ -12,22 +12,23 @@
 # each cluster in the pipeline. The "produce.service.props" should use the first cluster and
 # the "consume.service.props" should use the last cluster in the pipeline.
 
+# Produce service: Configure Produce Service to produce to the first cluster of the pipeline
+# Consume service: Configure Consume Service to consume from the last cluster of the pipeline
+# Additional clusters: If there are more than two clusters in the pipeline, add one property map for each of them.
 {
   "multi-cluster-monitor": {
     "class.name": "com.linkedin.kmf.apps.MultiClusterMonitor",
     "topic": "kafka-monitor-topic",
-
     "produce.service.props": {
-      "zookeeper.connect": "localhost:2181/cluster1",
+      "zookeeper.connect": "localhost:2181/first_cluster",
       "bootstrap.servers": "localhost:9092",
       "produce.record.delay.ms": 100,
       "produce.producer.props": {
         "client.id": "kafka-monitor-client-id"
       }
     },
-
     "consume.service.props": {
-      "zookeeper.connect": "localhost:2181/cluster2",
+      "zookeeper.connect": "localhost:2181/last_cluster",
       "bootstrap.servers": "localhost:9095",
       "consume.latency.sla.ms": "20000",
       "consume.consumer.props": {
@@ -37,7 +38,8 @@
 
     "topic.management.props.per.cluster" : {
       "first-cluster" : {
-        "zookeeper.connect": "localhost:2181/cluster1",
+        "bootstrap.servers": "localhost:9092",
+        "zookeeper.connect": "localhost:2181/first_cluster",
         "topic-management.topicCreationEnabled": true,
         "topic-management.replicationFactor" : 1,
         "topic-management.partitionsToBrokersRatio" : 2.0,
@@ -47,7 +49,8 @@
       },
 
       "last-cluster" : {
-        "zookeeper.connect": "localhost:2181/cluster2",
+        "bootstrap.servers": "localhost:9095",
+        "zookeeper.connect": "localhost:2181/last_cluster",
         "topic-management.topicCreationEnabled": true,
         "topic-management.replicationFactor" : 1,
         "topic-management.partitionsToBrokersRatio" : 2.0,
@@ -68,22 +71,22 @@
         "kmf.services:type=produce-service,name=*:records-produced-total",
         "kmf.services:type=consume-service,name=*:records-consumed-total",
         "kmf.services:type=consume-service,name=*:records-lost-total",
+        "kmf.services:type=consume-service,name=*:records-lost-rate",
         "kmf.services:type=consume-service,name=*:records-duplicated-total",
         "kmf.services:type=consume-service,name=*:records-delay-ms-avg",
         "kmf.services:type=produce-service,name=*:records-produced-rate",
         "kmf.services:type=produce-service,name=*:produce-error-rate",
-        "kmf.services:type=consume-service,name=*:consume-error-rate"
+        "kmf.services:type=consume-service,name=*:consume-error-rate",
+        "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg",
+        "kmf.services:type=commit-availability-service,name=*:commit-latency-avg",
+        "kmf.services:type=commit-availability-service,name=*:commit-availability-avg",
+        "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg",
+        "kmf.services:type=commit-availability-service,name=*:offsets-committed-total",
+        "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total"
       ]
     },
 
-  "jetty-service": {
-    "class.name": "com.linkedin.kmf.services.JettyService",
-    "jetty.port": 8000
-  },
-
   "jolokia-service": {
     "class.name": "com.linkedin.kmf.services.JolokiaService"
   }
 }
-
-
diff --git a/config/prometheus-exporter.yaml b/config/prometheus-exporter.yaml
new file mode 100644
index 00000000..7401e3a4
--- /dev/null
+++ b/config/prometheus-exporter.yaml
@@ -0,0 +1,7 @@
+---
+lowercaseOutputName: true
+rules:
+- pattern : kmf<type=(.+)><>([\w\d-]+)
+  name: kmf_$1_$2
+- pattern : kmf.services<type=(.+), name=(.+)><>([\w\d-]+)
+  name: kmf_services_$1_$2_$3
diff --git a/config/xinfra-monitor.properties b/config/xinfra-monitor.properties
new file mode 100644
index 00000000..6993bf47
--- /dev/null
+++ b/config/xinfra-monitor.properties
@@ -0,0 +1,197 @@
+# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+# file except in compliance with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# This properties file specifies the tests/services that XinfraMonitor +# should instantiate and run, together with the key/value pairs used to +# configure these tests/services. It should have the following format: +# +# { +# "name1" : { +# "type": TestClassName +# "key1": value1, +# "key2": value2, +# ... +# }, +# "name2" : { +# "type": ServiceClassName +# "key1": value1, +# "key2": value2, +# ... +# }, +# ... +# } +# +# TestClassName can be canonical name or simple name of any class that implements +# interface com.linkedin.kmf.services.Test. These classes should be under +# package com.linkedin.kmf.tests. +# +# ServiceClassName can be canonical name or simple name of any class that implements +# interface com.linkedin.kmf.services.Service. These classes should be under +# package com.linkedin.kmf.services. +# +# Each test/service should be configured with class.name which can be either TestClassName +# or ServiceClassName. The key for the test/service in the json map is used as name to +# identify the test/service in the log or JMX metrics, which is useful if multiple +# test/service with the same class.name are run in the same Kafka Monitor process. +# +# If using Secure Socket Layer for security protocol, SSL properties must be defined under +# produce.producer.props, consume.consumer.props, as well as single-cluster-monitor props + +{ + "single-cluster-monitor": { + "class.name": "com.linkedin.xinfra.monitor.apps.SingleClusterMonitor", + "topic": "xinfra-monitor-topic", + "zookeeper.connect": "localhost:2181", + "bootstrap.servers": "localhost:9092,localhost:9093", + "request.timeout.ms": 9000, + "produce.record.delay.ms": 100, + "topic-management.topicManagementEnabled": true, + "topic-management.topicCreationEnabled": true, + "topic-management.replicationFactor" : 1, + "topic-management.partitionsToBrokersRatio" : 2.0, + "topic-management.rebalance.interval.ms" : 600000, + "topic-management.preferred.leader.election.check.interval.ms" : 300000, + "topic-management.topicFactory.props": { + }, + "topic-management.topic.props": { + "retention.ms": "3600000" + }, + "produce.producer.props": { + "client.id": "kmf-client-id" + }, + + "consume.latency.sla.ms": "20000", + "consume.consumer.props": { + } + }, + + "offset-commit-service": { + "class.name": "com.linkedin.xinfra.monitor.services.OffsetCommitService", + "zookeeper.connect": "localhost:2181", + "bootstrap.servers": "localhost:9092,localhost:9093", + "consumer.props": { + "group.id": "target-consumer-group" + } + }, + + "jolokia-service": { + "class.name": "com.linkedin.xinfra.monitor.services.JolokiaService" + }, + + "reporter-service": { + "class.name": "com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService", + "report.interval.sec": 1, + "report.metrics.list": [ + "kmf:type=kafka-monitor:offline-runnable-count", + "kmf.services:type=produce-service,name=*:produce-availability-avg", + "kmf.services:type=consume-service,name=*:consume-availability-avg", + "kmf.services:type=produce-service,name=*:records-produced-total", + "kmf.services:type=consume-service,name=*:records-consumed-total", + "kmf.services:type=produce-service,name=*:records-produced-rate", + 
"kmf.services:type=produce-service,name=*:produce-error-rate", + "kmf.services:type=consume-service,name=*:consume-error-rate", + "kmf.services:type=consume-service,name=*:records-lost-total", + "kmf.services:type=consume-service,name=*:records-lost-rate", + "kmf.services:type=consume-service,name=*:records-duplicated-total", + "kmf.services:type=consume-service,name=*:records-delay-ms-avg", + "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg", + "kmf.services:type=commit-availability-service,name=*:offsets-committed-total", + "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg", + "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th", + "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-avg", + "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-max", + "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-avg", + "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-max", + "kmf.services:type=offset-commit-service,name=*:offset-commit-availability-avg", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-rate", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-total", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-rate", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-total" + ] + }, + + "cluster-topic-manipulation-service":{ + "class.name":"com.linkedin.xinfra.monitor.services.ClusterTopicManipulationService", + "zookeeper.connect": "localhost:2181", + "bootstrap.servers":"localhost:9092,localhost:9093", + "topic": "xinfra-monitor-topic" + }, + +# Example produce-service to produce messages to cluster +# "produce-service": { +# "class.name": "com.linkedin.kmf.services.ProduceService", +# "topic": "xinfra-monitor-topic", +# "zookeeper.connect": "localhost:2181", +# "bootstrap.servers": "localhost:9092", +# "consume.latency.sla.ms": "20000", +# "consume.consumer.props": { +# } +# }, + +# Example consume-service to consume messages +# "consume-service": { +# "class.name": "com.linkedin.kmf.services.ConsumeService", +# "topic": "xinfra-monitor-topic", +# "zookeeper.connect": "localhost:2181", +# "bootstrap.servers": "localhost:9092", +# "consume.latency.sla.ms": "20000", +# "consume.consumer.props": { +# } +# }, + +# Example statsd-service to report metrics +# "statsd-service": { +# "class.name": "com.linkedin.xinfra.monitor.services.StatsdMetricsReporterService", +# "report.statsd.host": "localhost", +# "report.statsd.port": "8125", +# "report.statsd.prefix": "xinfra-monitor", +# "report.interval.sec": 1, +# "report.metrics.list": [ +# "kmf.services:type=produce-service,name=*:produce-availability-avg", +# "kmf.services:type=consume-service,name=*:consume-availability-avg" +# ] +# }, + +# Example kafka-service to report metrics + "reporter-kafka-service": { + "class.name": 
"com.linkedin.xinfra.monitor.services.KafkaMetricsReporterService", + "report.interval.sec": 3, + "zookeeper.connect": "localhost:2181", + "bootstrap.servers": "localhost:9092", + "topic": "xinfra-monitor-topic-metrics", + "report.kafka.topic.replication.factor": 1, + "report.metrics.list": [ + "kmf.services:type=produce-service,name=*:produce-availability-avg", + "kmf.services:type=consume-service,name=*:consume-availability-avg", + "kmf.services:type=produce-service,name=*:records-produced-total", + "kmf.services:type=consume-service,name=*:records-consumed-total", + "kmf.services:type=consume-service,name=*:records-lost-total", + "kmf.services:type=consume-service,name=*:records-duplicated-total", + "kmf.services:type=consume-service,name=*:records-delay-ms-avg", + "kmf.services:type=produce-service,name=*:records-produced-rate", + "kmf.services:type=produce-service,name=*:produce-error-rate", + "kmf.services:type=consume-service,name=*:consume-error-rate" + ] + } + +# Example signalfx-service to report metrics +# "signalfx-service": { +# "class.name": "com.linkedin.kmf.services.SignalFxMetricsReporterService", +# "report.interval.sec": 1, +# "report.metric.dimensions": { +# }, +# "report.signalfx.url": "", +# "report.signalfx.token" : "" +# } + +} diff --git a/docker/Dockerfile b/docker/Dockerfile index fcd49e39..baff1023 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,14 +17,11 @@ MAINTAINER coffeepac@gmail.com WORKDIR /opt/kafka-monitor ADD build/ build/ -ADD core/build/ core/build/ -ADD services/build/ services/build/ -ADD tests/build/ tests/build/ -ADD bin/kafka-monitor-start.sh bin/kafka-monitor-start.sh +ADD bin/xinfra-monitor-start.sh bin/xinfra-monitor-start.sh ADD bin/kmf-run-class.sh bin/kmf-run-class.sh -ADD config/kafka-monitor.properties config/kafka-monitor.properties -ADD config/log4j.properties config/log4j.properties +ADD config/xinfra-monitor.properties config/xinfra-monitor.properties +ADD config/log4j2.properties config/log4j2.properties ADD docker/kafka-monitor-docker-entry.sh kafka-monitor-docker-entry.sh ADD webapp/ webapp/ -CMD ["/opt/kafka-monitor/kafka-monitor-docker-entry.sh"] \ No newline at end of file +CMD ["/opt/kafka-monitor/kafka-monitor-docker-entry.sh"] diff --git a/docker/kafka-monitor-docker-entry.sh b/docker/kafka-monitor-docker-entry.sh index 3abc003b..97554bb0 100755 --- a/docker/kafka-monitor-docker-entry.sh +++ b/docker/kafka-monitor-docker-entry.sh @@ -15,7 +15,13 @@ set -x +# SIGTERM-handler +trap 'pkill java; exit 130' SIGINT +trap 'pkill java; exit 143' SIGTERM + # wait for DNS services to be available sleep 10 -bin/kafka-monitor-start.sh config/kafka-monitor.properties +bin/xinfra-monitor-start.sh config/xinfra-monitor.properties & + +wait $! 
\ No newline at end of file diff --git a/docs/images/xinfra_monitor.png b/docs/images/xinfra_monitor.png new file mode 100644 index 00000000..d0dfce61 Binary files /dev/null and b/docs/images/xinfra_monitor.png differ diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index 05ef575b..51288f9c 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index c0abcf1d..842c8c5a 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ -#Tue Dec 06 22:38:25 EST 2016 +#Mon Apr 01 18:19:43 PDT 2019 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-all.zip diff --git a/gradlew b/gradlew index 9d82f789..2477741a 100755 --- a/gradlew +++ b/gradlew @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh ############################################################################## ## @@ -6,12 +6,30 @@ ## ############################################################################## -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null APP_NAME="Gradle" APP_BASE_NAME=`basename "$0"` +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS="" + # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD="maximum" @@ -30,6 +48,7 @@ die ( ) { cygwin=false msys=false darwin=false +nonstop=false case "`uname`" in CYGWIN* ) cygwin=true @@ -40,26 +59,11 @@ case "`uname`" in MINGW* ) msys=true ;; + NONSTOP* ) + nonstop=true + ;; esac -# Attempt to set APP_HOME -# Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi -done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null - CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar # Determine the Java command to use to start the JVM. @@ -85,7 +89,7 @@ location of your Java installation." fi # Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then MAX_FD_LIMIT=`ulimit -H -n` if [ $? -eq 0 ] ; then if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then @@ -100,18 +104,18 @@ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then fi fi -# For Darwin, add options to specify how the application appears in the dock +# For Darwin, add options to specify how the application appears in the dock. 
if $darwin; then GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" fi -# For Cygwin, switch paths to Windows format before running java +# For Cygwin, switch paths to Windows format before running java. if $cygwin ; then APP_HOME=`cygpath --path --mixed "$APP_HOME"` CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` JAVACMD=`cygpath --unix "$JAVACMD"` - # We build the pattern for arguments to be converted via cygpath + # We build the pattern for arguments to be converted via cygpath. ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` SEP="" for dir in $ROOTDIRSRAW ; do @@ -119,7 +123,7 @@ if $cygwin ; then SEP="|" done OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments + # Add a user-defined pattern to the cygpath arguments. if [ "$GRADLE_CYGPATTERN" != "" ] ; then OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" fi @@ -150,11 +154,19 @@ if $cygwin ; then esac fi -# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules -function splitJvmOpts() { - JVM_OPTS=("$@") +# Escape application args +save ( ) { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " } -eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS -JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi -exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat index aec99730..e95643d6 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -8,14 +8,14 @@ @rem Set local scope for the variables with windows NT shell if "%OS%"=="Windows_NT" setlocal -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS= - set DIRNAME=%~dp0 if "%DIRNAME%" == "" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + @rem Find java.exe if defined JAVA_HOME goto findJavaFromJavaHome @@ -46,10 +46,9 @@ echo location of your Java installation. goto fail :init -@rem Get command-line arguments, handling Windowz variants +@rem Get command-line arguments, handling Windows variants if not "%OS%" == "Windows_NT" goto win9xME_args -if "%@eval[2+2]" == "4" goto 4NT_args :win9xME_args @rem Slurp the command line arguments. 
@@ -60,11 +59,6 @@ set _SKIP=2 if "x%~1" == "x" goto execute set CMD_LINE_ARGS=%* -goto execute - -:4NT_args -@rem Get arguments from the 4NT Shell from JP Software -set CMD_LINE_ARGS=%$ :execute @rem Setup the command line diff --git a/scripts/publishToJfrog.sh b/scripts/publishToJfrog.sh new file mode 100755 index 00000000..cc12364d --- /dev/null +++ b/scripts/publishToJfrog.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +result=${PWD##*/} +if [[ "$result" = "scripts" ]] +then + echo "script must be run from root project folder, not $PWD" + exit 1 +else + echo "we are in $PWD and tag is $RELEASE_TAG" + + if [[ $RELEASE_TAG =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] + then + echo "publishing: tag $RELEASE_TAG looks like a semver" + git status + git describe --tags + ./gradlew printVersion + ./gradlew publishMyPublicationPublicationToLinkedInJfrogRepository + else + echo "not publishing: tag $RELEASE_TAG is NOT a valid semantic version (x.y.z)" + fi +fi diff --git a/semantic-build-versioning.gradle b/semantic-build-versioning.gradle new file mode 100644 index 00000000..bee379f7 --- /dev/null +++ b/semantic-build-versioning.gradle @@ -0,0 +1,2 @@ + +/* This is used by vivin:gradle-semantic-build-versioning plugin to generate versioned jar files. */ diff --git a/settings.gradle b/settings.gradle index e69de29b..f37e0228 100644 --- a/settings.gradle +++ b/settings.gradle @@ -0,0 +1,15 @@ +buildscript { + repositories { + maven { + url 'https://plugins.gradle.org/m2/' + } + } + dependencies { + classpath 'gradle.plugin.net.vivin:gradle-semantic-build-versioning:4.0.0' + } +} + +apply plugin: 'net.vivin.gradle-semantic-build-versioning' + +// otherwise it defaults to the folder name +rootProject.name = 'kafka-monitor' diff --git a/src/main/java/com/linkedin/kmf/common/Utils.java b/src/main/java/com/linkedin/kmf/common/Utils.java deleted file mode 100644 index e1c8494b..00000000 --- a/src/main/java/com/linkedin/kmf/common/Utils.java +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
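The release gate in scripts/publishToJfrog.sh above publishes only tags that are bare x.y.z semantic versions. A minimal Java sketch of the same check, for illustration only (SemverGate and isReleaseTag are invented names, not part of the source):

import java.util.regex.Pattern;

public class SemverGate {
  // Same pattern as the shell test: three dot-separated numeric components,
  // with no "v" prefix and no pre-release or build suffix.
  private static final Pattern SEMVER = Pattern.compile("^[0-9]+\\.[0-9]+\\.[0-9]+$");

  public static boolean isReleaseTag(String tag) {
    return SEMVER.matcher(tag).matches();
  }

  public static void main(String[] args) {
    System.out.println(isReleaseTag("2.5.1"));  // true
    System.out.println(isReleaseTag("v2.5.1")); // false: prefixed tag
    System.out.println(isReleaseTag("2.5"));    // false: only two components
  }
}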
- */ -package com.linkedin.kmf.common; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.lang.management.ManagementFactory; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.Properties; -import java.util.Set; - -import kafka.admin.AdminUtils; -import kafka.admin.RackAwareMode; -import kafka.server.KafkaConfig; -import kafka.utils.ZkUtils; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.Encoder; -import org.apache.avro.io.JsonEncoder; -import org.apache.kafka.common.errors.TopicExistsException; -import org.apache.kafka.common.security.JaasUtils; -import org.json.JSONObject; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.collection.Seq; - -import javax.management.MBeanAttributeInfo; -import javax.management.MBeanInfo; -import javax.management.MBeanServer; -import javax.management.ObjectName; - - -/** - * Kafka monitoring utilities. - */ -public class Utils { - private static final Logger LOG = LoggerFactory.getLogger(Utils.class); - - public static final int ZK_CONNECTION_TIMEOUT_MS = 30_000; - public static final int ZK_SESSION_TIMEOUT_MS = 30_000; - - /** - * Read number of partitions for the given topic on the specified zookeeper - * @param zkUrl zookeeper connection url - * @param topic topic name - * - * @return the number of partitions of the given topic - */ - public static int getPartitionNumForTopic(String zkUrl, String topic) { - ZkUtils zkUtils = ZkUtils.apply(zkUrl, ZK_SESSION_TIMEOUT_MS, ZK_CONNECTION_TIMEOUT_MS, JaasUtils.isZkSecurityEnabled()); - try { - Seq topics = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(topic)); - return zkUtils.getPartitionsForTopics(topics).apply(topic).size(); - } catch (NoSuchElementException e) { - return 0; - } finally { - zkUtils.close(); - } - } - - /** - * Create the topic that the monitor uses to monitor the cluster. This method attempts to create a topic so that all - * the brokers in the cluster will have partitionToBrokerRatio partitions. If the topic exists, but has different parameters - * then this does nothing to update the parameters. - * - * TODO: Do we care about rack aware mode? I would think no because we want to spread the topic over all brokers. - * @param zkUrl zookeeper connection url - * @param topic topic name - * @param replicationFactor the replication factor for the topic - * @param partitionToBrokerRatio This is multiplied by the number brokers to compute the number of partitions in the topic. 
- * @param topicConfig additional parameters for the topic for example min.insync.replicas - * @return the number of partitions created - */ - public static int createMonitoringTopicIfNotExists(String zkUrl, String topic, int replicationFactor, - double partitionToBrokerRatio, Properties topicConfig) { - ZkUtils zkUtils = ZkUtils.apply(zkUrl, ZK_SESSION_TIMEOUT_MS, ZK_CONNECTION_TIMEOUT_MS, JaasUtils.isZkSecurityEnabled()); - try { - if (AdminUtils.topicExists(zkUtils, topic)) { - return getPartitionNumForTopic(zkUrl, topic); - } - - int brokerCount = zkUtils.getAllBrokersInCluster().size(); - - int partitionCount = (int) Math.ceil(brokerCount * partitionToBrokerRatio); - - int defaultMinIsr = Math.max(replicationFactor - 1, 1); - if (!topicConfig.containsKey(KafkaConfig.MinInSyncReplicasProp())) { - topicConfig.setProperty(KafkaConfig.MinInSyncReplicasProp(), Integer.toString(defaultMinIsr)); - } - - try { - AdminUtils.createTopic(zkUtils, topic, partitionCount, replicationFactor, topicConfig, RackAwareMode.Enforced$.MODULE$); - } catch (TopicExistsException e) { - //There is a race condition with the consumer. - LOG.debug("Monitoring topic " + topic + " already exists in cluster " + zkUrl, e); - return getPartitionNumForTopic(zkUrl, topic); - } - LOG.info("Created monitoring topic " + topic + " in cluster " + zkUrl + " with " + partitionCount + " partitions, min ISR of " - + topicConfig.get(KafkaConfig.MinInSyncReplicasProp()) + " and replication factor of " + replicationFactor + "."); - - return partitionCount; - } finally { - zkUtils.close(); - } - } - - /** - * @param zkUrl zookeeper connection url - * @return number of brokers in this cluster - */ - public static int getBrokerCount(String zkUrl) { - ZkUtils zkUtils = ZkUtils.apply(zkUrl, ZK_SESSION_TIMEOUT_MS, ZK_CONNECTION_TIMEOUT_MS, JaasUtils.isZkSecurityEnabled()); - try { - return zkUtils.getAllBrokersInCluster().size(); - } finally { - zkUtils.close(); - } - } - - /** - * @param timestamp time in Ms when this message is generated - * @param topic topic this message is sent to - * @param idx index is consecutive numbers used by KafkaMonitor to determine duplicate or lost messages - * @param msgSize size of the message - * @return string that encodes the above fields - */ - public static String jsonFromFields(String topic, long idx, long timestamp, String producerId, int msgSize) { - GenericRecord record = new GenericData.Record(DefaultTopicSchema.MESSAGE_V0); - record.put(DefaultTopicSchema.TOPIC_FIELD.name(), topic); - record.put(DefaultTopicSchema.INDEX_FIELD.name(), idx); - record.put(DefaultTopicSchema.TIME_FIELD.name(), timestamp); - record.put(DefaultTopicSchema.PRODUCER_ID_FIELD.name(), producerId); - // CONTENT_FIELD is composed of #msgSize number of character 'x', e.g. xxxxxxxxxx - record.put(DefaultTopicSchema.CONTENT_FIELD.name(), String.format("%1$-" + msgSize + "s", "").replace(' ', 'x')); - return jsonFromGenericRecord(record); - } - - /** - * @param message kafka message in the string format - * @return GenericRecord that is deserialized from kafka message w.r.t. 
expected schema - */ - public static GenericRecord genericRecordFromJson(String message) { - GenericRecord record = new GenericData.Record(DefaultTopicSchema.MESSAGE_V0); - JSONObject jsonObject = new JSONObject(message); - record.put(DefaultTopicSchema.TOPIC_FIELD.name(), jsonObject.getString(DefaultTopicSchema.TOPIC_FIELD.name())); - record.put(DefaultTopicSchema.INDEX_FIELD.name(), jsonObject.getLong(DefaultTopicSchema.INDEX_FIELD.name())); - record.put(DefaultTopicSchema.TIME_FIELD.name(), jsonObject.getLong(DefaultTopicSchema.TIME_FIELD.name())); - record.put(DefaultTopicSchema.PRODUCER_ID_FIELD.name(), jsonObject.getString(DefaultTopicSchema.PRODUCER_ID_FIELD.name())); - record.put(DefaultTopicSchema.CONTENT_FIELD.name(), jsonObject.getString(DefaultTopicSchema.CONTENT_FIELD.name())); - return record; - } - - public static String jsonFromGenericRecord(GenericRecord record) { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - GenericDatumWriter writer = new GenericDatumWriter<>(DefaultTopicSchema.MESSAGE_V0); - - try { - Encoder encoder = new JsonEncoder(DefaultTopicSchema.MESSAGE_V0, out); - writer.write(record, encoder); - encoder.flush(); - } catch (IOException e) { - LOG.error("Unable to serialize avro record due to error " + e); - } - return out.toString(); - } - - public static List getMBeanAttributeValues(String mbeanExpr, String attributeExpr) { - List values = new ArrayList<>(); - MBeanServer server = ManagementFactory.getPlatformMBeanServer(); - try { - Set mbeanNames = server.queryNames(new ObjectName(mbeanExpr), null); - for (ObjectName mbeanName: mbeanNames) { - MBeanInfo mBeanInfo = server.getMBeanInfo(mbeanName); - MBeanAttributeInfo[] attributeInfos = mBeanInfo.getAttributes(); - for (MBeanAttributeInfo attributeInfo: attributeInfos) { - if (attributeInfo.getName().equals(attributeExpr) || attributeExpr.length() == 0 || attributeExpr.equals("*")) { - double value = (Double) server.getAttribute(mbeanName, attributeInfo.getName()); - values.add(new MbeanAttributeValue(mbeanName.getCanonicalName(), attributeInfo.getName(), value)); - } - } - } - } catch (Exception e) { - LOG.error("fail to retrieve value for " + mbeanExpr + ":" + attributeExpr, e); - } - return values; - } - -} diff --git a/src/main/java/com/linkedin/kmf/consumer/NewConsumer.java b/src/main/java/com/linkedin/kmf/consumer/NewConsumer.java deleted file mode 100644 index 0d827606..00000000 --- a/src/main/java/com/linkedin/kmf/consumer/NewConsumer.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
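The deleted Utils class above is the JMX access point behind the metric patterns listed in the reporter config (for example kmf.services:type=produce-service,name=*:produce-availability-avg). A hypothetical usage sketch, assuming it runs inside the same JVM where the services have registered their sensors; the class name MBeanQueryExample is illustrative:

import com.linkedin.kmf.common.MbeanAttributeValue;
import com.linkedin.kmf.common.Utils;
import java.util.List;

public class MBeanQueryExample {
  public static void main(String[] args) {
    // Matches every produce-service instance and reads one attribute from each.
    List<MbeanAttributeValue> values = Utils.getMBeanAttributeValues(
        "kmf.services:type=produce-service,name=*", "produce-availability-avg");
    for (MbeanAttributeValue value : values) {
      System.out.println(value); // relies on MbeanAttributeValue.toString()
    }
  }
}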
- */ -package com.linkedin.kmf.consumer; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import java.util.Arrays; -import java.util.Iterator; -import java.util.Properties; - -/* - * Wrap around the new consumer from Apache Kafka and implement the #KMBaseConsumer interface - */ -public class NewConsumer implements KMBaseConsumer { - - private final KafkaConsumer _consumer; - private Iterator> _recordIter; - - public NewConsumer(String topic, Properties consumerProperties) { - _consumer = new KafkaConsumer<>(consumerProperties); - _consumer.subscribe(Arrays.asList(topic)); - } - - @Override - public BaseConsumerRecord receive() { - if (_recordIter == null || !_recordIter.hasNext()) - _recordIter = _consumer.poll(Long.MAX_VALUE).iterator(); - - ConsumerRecord record = _recordIter.next(); - return new BaseConsumerRecord(record.topic(), record.partition(), record.offset(), record.key(), record.value()); - } - - @Override - public void close() { - _consumer.close(); - } - -} diff --git a/src/main/java/com/linkedin/kmf/consumer/OldConsumer.java b/src/main/java/com/linkedin/kmf/consumer/OldConsumer.java deleted file mode 100644 index fdd842cc..00000000 --- a/src/main/java/com/linkedin/kmf/consumer/OldConsumer.java +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
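NewConsumer above buffers one polled batch and hands records out one at a time. A self-contained sketch of the same pattern with the type parameters written out, using the newer Duration-based poll API: the original blocks in poll(Long.MAX_VALUE) and so never sees an empty batch, and the loop below is the equivalent guard for a bounded poll.

import java.time.Duration;
import java.util.Collections;
import java.util.Iterator;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class PollingConsumerSketch implements AutoCloseable {
  private final KafkaConsumer<String, String> consumer;
  private Iterator<ConsumerRecord<String, String>> recordIter;

  public PollingConsumerSketch(String topic, Properties props) {
    consumer = new KafkaConsumer<>(props);
    consumer.subscribe(Collections.singletonList(topic));
  }

  public ConsumerRecord<String, String> receive() {
    // Only go back to the broker when the previously fetched batch is exhausted.
    while (recordIter == null || !recordIter.hasNext()) {
      ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
      recordIter = records.iterator();
    }
    return recordIter.next();
  }

  @Override
  public void close() {
    consumer.close();
  }
}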
- */ -package com.linkedin.kmf.consumer; - -import kafka.consumer.Consumer; -import kafka.consumer.ConsumerConfig; -import kafka.javaapi.consumer.ConsumerConnector; -import kafka.consumer.ConsumerIterator; -import kafka.consumer.KafkaStream; -import kafka.message.MessageAndMetadata; -import kafka.serializer.StringDecoder; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; - -/* - * Wrap around the old consumer from Apache Kafka and implement the #KMBaseConsumer interface - */ -public class OldConsumer implements KMBaseConsumer { - - private final ConsumerConnector _connector; - private final ConsumerIterator _iter; - - public OldConsumer(String topic, Properties consumerProperties) { - _connector = Consumer.createJavaConsumerConnector(new ConsumerConfig(consumerProperties)); - Map topicCountMap = new HashMap<>(); - topicCountMap.put(topic, 1); - Map>> kafkaStreams = _connector.createMessageStreams(topicCountMap, new StringDecoder(null), new StringDecoder(null)); - _iter = kafkaStreams.get(topic).get(0).iterator(); - } - - @Override - public BaseConsumerRecord receive() { - if (!_iter.hasNext()) - return null; - MessageAndMetadata record = _iter.next(); - return new BaseConsumerRecord(record.topic(), record.partition(), record.offset(), record.key(), record.message()); - } - - @Override - public void close() { - _connector.shutdown(); - } - -} diff --git a/src/main/java/com/linkedin/kmf/partitioner/OldKMPartitioner.java b/src/main/java/com/linkedin/kmf/partitioner/OldKMPartitioner.java deleted file mode 100644 index fa356eff..00000000 --- a/src/main/java/com/linkedin/kmf/partitioner/OldKMPartitioner.java +++ /dev/null @@ -1,17 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -package com.linkedin.kmf.partitioner; - -public class OldKMPartitioner implements KMPartitioner { - - public int partition(String key, int partitionNum) { - return Math.abs(key.hashCode()) % partitionNum; - } -} diff --git a/src/main/java/com/linkedin/kmf/services/ConsumeService.java b/src/main/java/com/linkedin/kmf/services/ConsumeService.java deleted file mode 100644 index e2d07f77..00000000 --- a/src/main/java/com/linkedin/kmf/services/ConsumeService.java +++ /dev/null @@ -1,288 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
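One subtlety in OldKMPartitioner above: Math.abs(Integer.MIN_VALUE) is still negative, so a key whose hashCode() happens to be Integer.MIN_VALUE would map to a negative partition. A sketch of the usual sign-safe variant, masking the sign bit as Kafka's own Utils.toPositive does; the class name is illustrative:

public class SafeModuloPartitioner {
  // Clearing the sign bit keeps the result in [0, partitionNum) for every
  // possible hashCode(), including Integer.MIN_VALUE.
  public int partition(String key, int partitionNum) {
    return (key.hashCode() & 0x7fffffff) % partitionNum;
  }
}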
- */ -package com.linkedin.kmf.services; - -import com.linkedin.kmf.common.DefaultTopicSchema; -import com.linkedin.kmf.common.Utils; -import com.linkedin.kmf.consumer.BaseConsumerRecord; -import com.linkedin.kmf.consumer.KMBaseConsumer; -import com.linkedin.kmf.consumer.NewConsumer; -import com.linkedin.kmf.consumer.OldConsumer; -import com.linkedin.kmf.services.configs.ConsumeServiceConfig; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Random; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.avro.generic.GenericRecord; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.common.MetricName; -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.common.metrics.JmxReporter; -import org.apache.kafka.common.metrics.Measurable; -import org.apache.kafka.common.metrics.MetricConfig; -import org.apache.kafka.common.metrics.Metrics; -import org.apache.kafka.common.metrics.MetricsReporter; -import org.apache.kafka.common.metrics.Sensor; -import org.apache.kafka.common.metrics.stats.Avg; -import org.apache.kafka.common.metrics.stats.Max; -import org.apache.kafka.common.metrics.stats.Percentile; -import org.apache.kafka.common.metrics.stats.Percentiles; -import org.apache.kafka.common.metrics.stats.Rate; -import org.apache.kafka.common.metrics.stats.Total; -import org.apache.kafka.common.serialization.StringDeserializer; -import org.apache.kafka.common.utils.SystemTime; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class ConsumeService implements Service { - private static final Logger LOG = LoggerFactory.getLogger(ConsumeService.class); - private static final String METRIC_GROUP_NAME = "consume-service"; - private static final String[] NONOVERRIDABLE_PROPERTIES = - new String[] {ConsumeServiceConfig.BOOTSTRAP_SERVERS_CONFIG, - ConsumeServiceConfig.ZOOKEEPER_CONNECT_CONFIG}; - - private final String _name; - private final ConsumeMetrics _sensors; - private final KMBaseConsumer _consumer; - private final Thread _thread; - private final int _latencyPercentileMaxMs; - private final int _latencyPercentileGranularityMs; - private final AtomicBoolean _running; - private final int _latencySlaMs; - - public ConsumeService(Map props, String name) throws Exception { - _name = name; - Map consumerPropsOverride = props.containsKey(ConsumeServiceConfig.CONSUMER_PROPS_CONFIG) - ? 
(Map) props.get(ConsumeServiceConfig.CONSUMER_PROPS_CONFIG) : new HashMap<>(); - ConsumeServiceConfig config = new ConsumeServiceConfig(props); - String topic = config.getString(ConsumeServiceConfig.TOPIC_CONFIG); - String zkConnect = config.getString(ConsumeServiceConfig.ZOOKEEPER_CONNECT_CONFIG); - String brokerList = config.getString(ConsumeServiceConfig.BOOTSTRAP_SERVERS_CONFIG); - String consumerClassName = config.getString(ConsumeServiceConfig.CONSUMER_CLASS_CONFIG); - _latencySlaMs = config.getInt(ConsumeServiceConfig.LATENCY_SLA_MS_CONFIG); - _latencyPercentileMaxMs = config.getInt(ConsumeServiceConfig.LATENCY_PERCENTILE_MAX_MS_CONFIG); - _latencyPercentileGranularityMs = config.getInt(ConsumeServiceConfig.LATENCY_PERCENTILE_GRANULARITY_MS_CONFIG); - _running = new AtomicBoolean(false); - - for (String property: NONOVERRIDABLE_PROPERTIES) { - if (consumerPropsOverride.containsKey(property)) { - throw new ConfigException("Override must not contain " + property + " config."); - } - } - - Properties consumerProps = new Properties(); - - // Assign default config. This has the lowest priority. - consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - consumerProps.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); - consumerProps.put(ConsumerConfig.CLIENT_ID_CONFIG, "kmf-consumer"); - consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "kmf-consumer-group-" + new Random().nextInt()); - consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); - consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); - - if (consumerClassName.equals(NewConsumer.class.getCanonicalName()) || consumerClassName.equals(NewConsumer.class.getSimpleName())) { - consumerClassName = NewConsumer.class.getCanonicalName(); - } else if (consumerClassName.equals(OldConsumer.class.getCanonicalName()) || consumerClassName.equals(OldConsumer.class.getSimpleName())) { - consumerClassName = OldConsumer.class.getCanonicalName(); - // The name/value of these configs are changed in the new consumer. - consumerProps.put("auto.commit.enable", "false"); - consumerProps.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "largest"); - } - - // Assign config specified for ConsumeService. - consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); - consumerProps.put("zookeeper.connect", zkConnect); - - // Assign config specified for consumer. This has the highest priority. 
- consumerProps.putAll(consumerPropsOverride); - - _consumer = (KMBaseConsumer) Class.forName(consumerClassName).getConstructor(String.class, Properties.class).newInstance(topic, consumerProps); - - _thread = new Thread(new Runnable() { - @Override - public void run() { - try { - consume(); - } catch (Exception e) { - LOG.error(_name + "/ConsumeService failed", e); - } - } - }, _name + " consume-service"); - _thread.setDaemon(true); - - MetricConfig metricConfig = new MetricConfig().samples(60).timeWindow(1000, TimeUnit.MILLISECONDS); - List reporters = new ArrayList<>(); - reporters.add(new JmxReporter(JMX_PREFIX)); - Metrics metrics = new Metrics(metricConfig, reporters, new SystemTime()); - Map tags = new HashMap<>(); - tags.put("name", _name); - _sensors = new ConsumeMetrics(metrics, tags); - } - - private void consume() throws Exception { - // Delay 1 second to reduce the chance that consumer creates topic before TopicManagementService - Thread.sleep(1000); - - Map nextIndexes = new HashMap<>(); - - while (_running.get()) { - BaseConsumerRecord record; - try { - record = _consumer.receive(); - } catch (Exception e) { - _sensors._consumeError.record(); - LOG.warn(_name + "/ConsumeService failed to receive record", e); - // Avoid busy while loop - Thread.sleep(100); - continue; - } - - if (record == null) - continue; - - GenericRecord avroRecord = Utils.genericRecordFromJson(record.value()); - if (avroRecord == null) { - _sensors._consumeError.record(); - continue; - } - int partition = record.partition(); - long index = (Long) avroRecord.get(DefaultTopicSchema.INDEX_FIELD.name()); - long currMs = System.currentTimeMillis(); - long prevMs = (Long) avroRecord.get(DefaultTopicSchema.TIME_FIELD.name()); - _sensors._recordsConsumed.record(); - _sensors._bytesConsumed.record(record.value().length()); - _sensors._recordsDelay.record(currMs - prevMs); - - if (currMs - prevMs > _latencySlaMs) - _sensors._recordsDelayed.record(); - - if (index == -1L || !nextIndexes.containsKey(partition)) { - nextIndexes.put(partition, -1L); - continue; - } - - long nextIndex = nextIndexes.get(partition); - if (nextIndex == -1 || index == nextIndex) { - nextIndexes.put(partition, index + 1); - } else if (index < nextIndex) { - _sensors._recordsDuplicated.record(); - } else if (index > nextIndex) { - nextIndexes.put(partition, index + 1); - _sensors._recordsLost.record(index - nextIndex); - } - } - } - - @Override - public synchronized void start() { - if (_running.compareAndSet(false, true)) { - _thread.start(); - LOG.info("{}/ConsumeService started", _name); - } - } - - @Override - public synchronized void stop() { - if (_running.compareAndSet(true, false)) { - try { - _consumer.close(); - } catch (Exception e) { - LOG.warn(_name + "/ConsumeService while trying to close consumer.", e); - } - LOG.info("{}/ConsumeService stopped", _name); - } - } - - @Override - public void awaitShutdown() { - LOG.info("{}/ConsumeService shutdown completed", _name); - } - - @Override - public boolean isRunning() { - return _running.get() && _thread.isAlive(); - } - - private class ConsumeMetrics { - public final Metrics metrics; - private final Sensor _bytesConsumed; - private final Sensor _consumeError; - private final Sensor _recordsConsumed; - private final Sensor _recordsDuplicated; - private final Sensor _recordsLost; - private final Sensor _recordsDelay; - private final Sensor _recordsDelayed; - - public ConsumeMetrics(Metrics metrics, final Map tags) { - this.metrics = metrics; - - _bytesConsumed = 
metrics.sensor("bytes-consumed"); - _bytesConsumed.add(new MetricName("bytes-consumed-rate", METRIC_GROUP_NAME, "The average number of bytes per second that are consumed", tags), new Rate()); - - _consumeError = metrics.sensor("consume-error"); - _consumeError.add(new MetricName("consume-error-rate", METRIC_GROUP_NAME, "The average number of errors per second", tags), new Rate()); - _consumeError.add(new MetricName("consume-error-total", METRIC_GROUP_NAME, "The total number of errors", tags), new Total()); - - _recordsConsumed = metrics.sensor("records-consumed"); - _recordsConsumed.add(new MetricName("records-consumed-rate", METRIC_GROUP_NAME, "The average number of records per second that are consumed", tags), new Rate()); - _recordsConsumed.add(new MetricName("records-consumed-total", METRIC_GROUP_NAME, "The total number of records that are consumed", tags), new Total()); - - _recordsDuplicated = metrics.sensor("records-duplicated"); - _recordsDuplicated.add(new MetricName("records-duplicated-rate", METRIC_GROUP_NAME, "The average number of records per second that are duplicated", tags), new Rate()); - _recordsDuplicated.add(new MetricName("records-duplicated-total", METRIC_GROUP_NAME, "The total number of records that are duplicated", tags), new Total()); - - _recordsLost = metrics.sensor("records-lost"); - _recordsLost.add(new MetricName("records-lost-rate", METRIC_GROUP_NAME, "The average number of records per second that are lost", tags), new Rate()); - _recordsLost.add(new MetricName("records-lost-total", METRIC_GROUP_NAME, "The total number of records that are lost", tags), new Total()); - - _recordsDelayed = metrics.sensor("records-delayed"); - _recordsDelayed.add(new MetricName("records-delayed-rate", METRIC_GROUP_NAME, "The average number of records per second that are either lost or arrive after maximum allowed latency under SLA", tags), new Rate()); - _recordsDelayed.add(new MetricName("records-delayed-total", METRIC_GROUP_NAME, "The total number of records that are either lost or arrive after maximum allowed latency under SLA", tags), new Total()); - - _recordsDelay = metrics.sensor("records-delay"); - _recordsDelay.add(new MetricName("records-delay-ms-avg", METRIC_GROUP_NAME, "The average latency of records from producer to consumer", tags), new Avg()); - _recordsDelay.add(new MetricName("records-delay-ms-max", METRIC_GROUP_NAME, "The maximum latency of records from producer to consumer", tags), new Max()); - - // There are 2 extra buckets use for values smaller than 0.0 or larger than max, respectively. 
- int bucketNum = _latencyPercentileMaxMs / _latencyPercentileGranularityMs + 2; - int sizeInBytes = 4 * bucketNum; - _recordsDelay.add(new Percentiles(sizeInBytes, _latencyPercentileMaxMs, Percentiles.BucketSizing.CONSTANT, - new Percentile(new MetricName("records-delay-ms-99th", METRIC_GROUP_NAME, "The 99th percentile latency of records from producer to consumer", tags), 99.0), - new Percentile(new MetricName("records-delay-ms-999th", METRIC_GROUP_NAME, "The 999th percentile latency of records from producer to consumer", tags), 99.9))); - - metrics.addMetric(new MetricName("consume-availability-avg", METRIC_GROUP_NAME, "The average consume availability", tags), - new Measurable() { - @Override - public double measure(MetricConfig config, long now) { - double recordsConsumedRate = _sensors.metrics.metrics().get(new MetricName("records-consumed-rate", METRIC_GROUP_NAME, tags)).value(); - double recordsLostRate = _sensors.metrics.metrics().get(new MetricName("records-lost-rate", METRIC_GROUP_NAME, tags)).value(); - double recordsDelayedRate = _sensors.metrics.metrics().get(new MetricName("records-delayed-rate", METRIC_GROUP_NAME, tags)).value(); - - if (new Double(recordsLostRate).isNaN()) - recordsLostRate = 0; - if (new Double(recordsDelayedRate).isNaN()) - recordsDelayedRate = 0; - - double consumeAvailability = recordsConsumedRate + recordsLostRate > 0 - ? (recordsConsumedRate - recordsDelayedRate) / (recordsConsumedRate + recordsLostRate) : 0; - - return consumeAvailability; - } - } - ); - } - - } - -} \ No newline at end of file diff --git a/src/main/java/com/linkedin/kmf/services/JettyService.java b/src/main/java/com/linkedin/kmf/services/JettyService.java deleted file mode 100644 index 42b74ae5..00000000 --- a/src/main/java/com/linkedin/kmf/services/JettyService.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -package com.linkedin.kmf.services; - -import com.linkedin.kmf.services.configs.JettyServiceConfig; -import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.ResourceHandler; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Map; - -// Jetty server that serves html files. 
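The consume-availability gauge defined above combines three rates. A worked sketch of the same arithmetic (ConsumeAvailability is an invented name; in the service the rates come from the registered sensors): with 100 records/s consumed, 1/s lost and 2/s delayed, availability is (100 - 2) / (100 + 1) ≈ 0.970.

public final class ConsumeAvailability {
  private ConsumeAvailability() { }

  // Mirrors the Measurable above: NaN rates are treated as zero, and the
  // result is 0 when nothing has been consumed or lost yet.
  public static double of(double consumedRate, double lostRate, double delayedRate) {
    if (Double.isNaN(lostRate)) {
      lostRate = 0;
    }
    if (Double.isNaN(delayedRate)) {
      delayedRate = 0;
    }
    return consumedRate + lostRate > 0
        ? (consumedRate - delayedRate) / (consumedRate + lostRate)
        : 0;
  }

  public static void main(String[] args) {
    System.out.println(of(100, 1, 2)); // ~0.9703
  }
}

The Percentiles histogram above is sized by the same bucket formula: with, say, a 5000 ms latency ceiling and 1 ms granularity, bucketNum is 5002 and the histogram costs about 20 KB (4 bytes per bucket) per consumer.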
-public class JettyService implements Service { - private static final Logger LOG = LoggerFactory.getLogger(JettyService.class); - - private final String _name; - private final Server _jettyServer; - private final int _port; - - public JettyService(Map props, String name) { - _name = name; - JettyServiceConfig config = new JettyServiceConfig(props); - _port = config.getInt(JettyServiceConfig.PORT_CONFIG); - _jettyServer = new Server(_port); - ResourceHandler resourceHandler = new ResourceHandler(); - resourceHandler.setDirectoriesListed(true); - resourceHandler.setWelcomeFiles(new String[]{"index.html"}); - resourceHandler.setResourceBase("webapp"); - _jettyServer.setHandler(resourceHandler); - } - - public synchronized void start() { - try { - _jettyServer.start(); - LOG.info("{}/JettyService started at port {}", _name, _port); - } catch (Exception e) { - LOG.error(_name + "/JettyService failed to start", e); - } - } - - public synchronized void stop() { - try { - _jettyServer.stop(); - LOG.info("{}/JettyService stopped", _name); - } catch (Exception e) { - LOG.error(_name + "/JettyService failed to stop", e); - } - } - - public boolean isRunning() { - return _jettyServer.isRunning(); - } - - public void awaitShutdown() { - - } - -} diff --git a/src/main/java/com/linkedin/kmf/services/MultiClusterTopicManagementService.java b/src/main/java/com/linkedin/kmf/services/MultiClusterTopicManagementService.java deleted file mode 100644 index d869d41c..00000000 --- a/src/main/java/com/linkedin/kmf/services/MultiClusterTopicManagementService.java +++ /dev/null @@ -1,380 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ - -package com.linkedin.kmf.services; - -import com.linkedin.kmf.common.Utils; -import com.linkedin.kmf.services.configs.CommonServiceConfig; -import com.linkedin.kmf.services.configs.MultiClusterTopicManagementServiceConfig; -import com.linkedin.kmf.services.configs.TopicManagementServiceConfig; -import com.linkedin.kmf.topicfactory.TopicFactory; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.TimeUnit; -import kafka.admin.AdminOperationException; -import java.util.concurrent.atomic.AtomicBoolean; -import kafka.admin.AdminUtils; -import kafka.admin.BrokerMetadata; -import kafka.admin.PreferredReplicaLeaderElectionCommand; -import kafka.admin.RackAwareMode; -import kafka.cluster.Broker; -import kafka.common.TopicAndPartition; -import kafka.utils.ZkUtils; -import org.I0Itec.zkclient.exception.ZkNodeExistsException; -import org.apache.kafka.common.Node; -import org.apache.kafka.common.PartitionInfo; -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.common.security.JaasUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.collection.Seq; - -import static com.linkedin.kmf.common.Utils.ZK_CONNECTION_TIMEOUT_MS; -import static com.linkedin.kmf.common.Utils.ZK_SESSION_TIMEOUT_MS; - -/** - * This service periodically checks and rebalances the monitor topics across a pipeline of Kafka clusters so that - * leadership of the partitions of the monitor topic in each cluster is distributed evenly across brokers in the cluster. - * - * More specifically, this service may do some or all of the following tasks depending on the config: - * - * - Create the monitor topic using the user-specified replication factor and partition number - * - Increase partition number of the monitor topic if either partitionsToBrokersRatio or minPartitionNum is not satisfied - * - Increase replication factor of the monitor topic if the user-specified replicationFactor is not satisfied - * - Reassign partition across brokers to make sure each broker acts as preferred leader of at least one partition of the monitor topic - * - Trigger preferred leader election to make sure each broker acts as leader of at least one partition of the monitor topic. - * - Make sure the number of partitions of the monitor topic is same across all monitored custers. - * - */ -public class MultiClusterTopicManagementService implements Service { - private static final Logger LOG = LoggerFactory.getLogger(MultiClusterTopicManagementService.class); - - private final AtomicBoolean _isRunning = new AtomicBoolean(false); - private final String _serviceName; - private final Map _topicManagementByCluster; - private final int _scheduleIntervalMs; - private final ScheduledExecutorService _executor; - - public MultiClusterTopicManagementService(Map props, String serviceName) throws Exception { - _serviceName = serviceName; - MultiClusterTopicManagementServiceConfig config = new MultiClusterTopicManagementServiceConfig(props); - String topic = config.getString(CommonServiceConfig.TOPIC_CONFIG); - Map propsByCluster = props.containsKey(MultiClusterTopicManagementServiceConfig.PROPS_PER_CLUSTER_CONFIG) - ? 
(Map) props.get(MultiClusterTopicManagementServiceConfig.PROPS_PER_CLUSTER_CONFIG) : new HashMap<>(); - _topicManagementByCluster = initializeTopicManagementHelper(propsByCluster, topic); - _scheduleIntervalMs = config.getInt(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG); - _executor = Executors.newSingleThreadScheduledExecutor(new ThreadFactory() { - @Override - public Thread newThread(Runnable r) { - return new Thread(r, _serviceName + "-multi-cluster-topic-management-service"); - } - }); - } - - private Map initializeTopicManagementHelper(Map propsByCluster, String topic) throws Exception { - Map topicManagementByCluster = new HashMap<>(); - for (Map.Entry entry: propsByCluster.entrySet()) { - String clusterName = entry.getKey(); - Map serviceProps = entry.getValue(); - if (serviceProps.containsKey(MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG)) - throw new ConfigException("The raw per-cluster config for MultiClusterTopicManagementService must not contain " + - MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG); - serviceProps.put(MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG, topic); - topicManagementByCluster.put(clusterName, new TopicManagementHelper(serviceProps)); - } - return topicManagementByCluster; - } - - @Override - public synchronized void start() { - if (_isRunning.compareAndSet(false, true)) { - Runnable r = new TopicManagementRunnable(); - _executor.scheduleWithFixedDelay(r, 0, _scheduleIntervalMs, TimeUnit.MILLISECONDS); - LOG.info("{}/MultiClusterTopicManagementService started.", _serviceName); - } - } - - @Override - public synchronized void stop() { - if (_isRunning.compareAndSet(true, false)) { - _executor.shutdown(); - LOG.info("{}/MultiClusterTopicManagementService stopped.", _serviceName); - } - } - - @Override - public boolean isRunning() { - return _isRunning.get() && !_executor.isShutdown(); - } - - @Override - public void awaitShutdown() { - try { - _executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - LOG.info("Thread interrupted when waiting for {}/MultiClusterTopicManagementService to shutdown", _serviceName); - } - LOG.info("{}/MultiClusterTopicManagementService shutdown completed", _serviceName); - } - - private class TopicManagementRunnable implements Runnable { - @Override - public void run() { - try { - for (TopicManagementHelper helper : _topicManagementByCluster.values()) { - helper.maybeCreateTopic(); - } - - /* - * The partition number of the monitor topics should be the minimum partition number that satisifies the following conditions: - * - partition number of the monitor topics across all monitored clusters should be the same - * - partitionNum / brokerNum >= user-configured partitionsToBrokersRatio. 
- * - partitionNum >= user-configured minPartitionNum - */ - - int minPartitionNum = 0; - for (TopicManagementHelper helper : _topicManagementByCluster.values()) { - minPartitionNum = Math.max(minPartitionNum, helper.minPartitionNum()); - } - for (TopicManagementHelper helper : _topicManagementByCluster.values()) { - helper.maybeAddPartitions(minPartitionNum); - } - - for (Map.Entry entry : _topicManagementByCluster.entrySet()) { - String clusterName = entry.getKey(); - TopicManagementHelper helper = entry.getValue(); - try { - helper.maybeReassignPartitionAndElectLeader(); - } catch (IOException | ZkNodeExistsException | AdminOperationException e) { - LOG.warn(_serviceName + "/MultiClusterTopicManagementService will retry later in cluster " + clusterName, e); - } - } - } catch (Exception e) { - LOG.error(_serviceName + "/MultiClusterTopicManagementService will stop due to error.", e); - stop(); - } - } - } - - static class TopicManagementHelper { - - private final boolean _topicCreationEnabled; - private final String _topic; - private final String _zkConnect; - private final int _replicationFactor; - private final double _minPartitionsToBrokersRatio; - private final int _minPartitionNum; - private final TopicFactory _topicFactory; - private final Properties _topicProperties; - - TopicManagementHelper(Map props) throws Exception { - TopicManagementServiceConfig config = new TopicManagementServiceConfig(props); - _topicCreationEnabled = config.getBoolean(TopicManagementServiceConfig.TOPIC_CREATION_ENABLED_CONFIG); - _topic = config.getString(TopicManagementServiceConfig.TOPIC_CONFIG); - _zkConnect = config.getString(TopicManagementServiceConfig.ZOOKEEPER_CONNECT_CONFIG); - _replicationFactor = config.getInt(TopicManagementServiceConfig.TOPIC_REPLICATION_FACTOR_CONFIG); - _minPartitionsToBrokersRatio = config.getDouble(TopicManagementServiceConfig.PARTITIONS_TO_BROKERS_RATIO_CONFIG); - _minPartitionNum = config.getInt(TopicManagementServiceConfig.MIN_PARTITION_NUM_CONFIG); - String topicFactoryClassName = config.getString(TopicManagementServiceConfig.TOPIC_FACTORY_CLASS_CONFIG); - _topicProperties = new Properties(); - if (props.containsKey(TopicManagementServiceConfig.TOPIC_PROPS_CONFIG)) - _topicProperties.putAll((Map) props.get(TopicManagementServiceConfig.TOPIC_PROPS_CONFIG)); - - Map topicFactoryConfig = props.containsKey(TopicManagementServiceConfig.TOPIC_FACTORY_PROPS_CONFIG) ? 
- (Map) props.get(TopicManagementServiceConfig.TOPIC_FACTORY_PROPS_CONFIG) : new HashMap(); - _topicFactory = (TopicFactory) Class.forName(topicFactoryClassName).getConstructor(Map.class).newInstance(topicFactoryConfig); - } - - void maybeCreateTopic() throws Exception { - if (_topicCreationEnabled) { - _topicFactory.createTopicIfNotExist(_zkConnect, _topic, _replicationFactor, _minPartitionsToBrokersRatio, _topicProperties); - } - } - - int minPartitionNum() { - int brokerCount = Utils.getBrokerCount(_zkConnect); - return Math.max((int) Math.ceil(_minPartitionsToBrokersRatio * brokerCount), _minPartitionNum); - } - - void maybeAddPartitions(int minPartitionNum) { - ZkUtils zkUtils = ZkUtils.apply(_zkConnect, ZK_SESSION_TIMEOUT_MS, ZK_CONNECTION_TIMEOUT_MS, JaasUtils.isZkSecurityEnabled()); - try { - int partitionNum = getPartitionInfo(zkUtils, _topic).size(); - if (partitionNum < minPartitionNum) { - LOG.info("MultiClusterTopicManagementService will increase partition of the topic {} " - + "in cluster {} from {} to {}.", _topic, _zkConnect, partitionNum, minPartitionNum); - AdminUtils.addPartitions(zkUtils, _topic, minPartitionNum, null, false, RackAwareMode.Enforced$.MODULE$); - } - } finally { - zkUtils.close(); - } - } - - void maybeReassignPartitionAndElectLeader() throws Exception { - ZkUtils zkUtils = ZkUtils.apply(_zkConnect, ZK_SESSION_TIMEOUT_MS, ZK_CONNECTION_TIMEOUT_MS, JaasUtils.isZkSecurityEnabled()); - - try { - List partitionInfoList = getPartitionInfo(zkUtils, _topic); - Collection brokers = scala.collection.JavaConversions.asJavaCollection(zkUtils.getAllBrokersInCluster()); - - if (partitionInfoList.size() == 0) - throw new IllegalStateException("Topic " + _topic + " does not exist in cluster " + _zkConnect); - - int currentReplicationFactor = getReplicationFactor(partitionInfoList); - - if (_replicationFactor < currentReplicationFactor) - throw new RuntimeException(String.format("Configured replication factor %d " - + "is smaller than the current replication factor %d of the topic %s in cluster %s", - _replicationFactor, currentReplicationFactor, _topic, _zkConnect)); - - if (_replicationFactor > currentReplicationFactor && zkUtils.getPartitionsBeingReassigned().isEmpty()) { - LOG.info("MultiClusterTopicManagementService will increase the replication factor of the topic {} in cluster {}", _topic, _zkConnect); - reassignPartitions(zkUtils, brokers, _topic, partitionInfoList.size(), _replicationFactor); - } - - if (partitionInfoList.size() >= brokers.size() && - someBrokerNotPreferredLeader(partitionInfoList, brokers) && - zkUtils.getPartitionsBeingReassigned().isEmpty()) { - LOG.info("MultiClusterTopicManagementService will reassign partitions of the topic {} in cluster {}", _topic, _zkConnect); - reassignPartitions(zkUtils, brokers, _topic, partitionInfoList.size(), _replicationFactor); - } - - if (partitionInfoList.size() >= brokers.size() && - someBrokerNotElectedLeader(partitionInfoList, brokers)) { - LOG.info("MultiClusterTopicManagementService will trigger preferred leader election for the topic {} in cluster {}", _topic, _zkConnect); - triggerPreferredLeaderElection(zkUtils, partitionInfoList); - } - } finally { - zkUtils.close(); - } - } - - private static void triggerPreferredLeaderElection(ZkUtils zkUtils, List partitionInfoList) { - scala.collection.mutable.HashSet scalaPartitionInfoSet = new scala.collection.mutable.HashSet<>(); - for (PartitionInfo javaPartitionInfo : partitionInfoList) { - scalaPartitionInfoSet.add(new 
TopicAndPartition(javaPartitionInfo.topic(), javaPartitionInfo.partition())); - } - PreferredReplicaLeaderElectionCommand.writePreferredReplicaElectionData(zkUtils, scalaPartitionInfoSet); - } - - private static void reassignPartitions(ZkUtils zkUtils, Collection brokers, String topic, int partitionCount, int replicationFactor) { - scala.collection.mutable.ArrayBuffer brokersMetadata = new scala.collection.mutable.ArrayBuffer<>(brokers.size()); - for (Broker broker : brokers) { - brokersMetadata.$plus$eq(new BrokerMetadata(broker.id(), broker.rack())); - } - scala.collection.Map> partitionToReplicas = - AdminUtils.assignReplicasToBrokers(brokersMetadata, partitionCount, replicationFactor, 0, 0); - String jsonReassignmentData = formatAsReassignmentJson(topic, partitionToReplicas); - zkUtils.createPersistentPath(ZkUtils.ReassignPartitionsPath(), jsonReassignmentData, zkUtils.DefaultAcls()); - } - - private static List getPartitionInfo(ZkUtils zkUtils, String topic) { - scala.collection.mutable.ArrayBuffer topicList = new scala.collection.mutable.ArrayBuffer<>(); - topicList.$plus$eq(topic); - scala.collection.Map> partitionAssignments = - zkUtils.getPartitionAssignmentForTopics(topicList).apply(topic); - List partitionInfoList = new ArrayList<>(); - scala.collection.Iterator>> it = partitionAssignments.iterator(); - while (it.hasNext()) { - scala.Tuple2> scalaTuple = it.next(); - Integer partition = (Integer) scalaTuple._1(); - scala.Option leaderOption = zkUtils.getLeaderForPartition(topic, partition); - Node leader = leaderOption.isEmpty() ? null : new Node((Integer) leaderOption.get(), "", -1); - Node[] replicas = new Node[scalaTuple._2().size()]; - for (int i = 0; i < replicas.length; i++) { - Integer brokerId = (Integer) scalaTuple._2().apply(i); - replicas[i] = new Node(brokerId, "", -1); - } - partitionInfoList.add(new PartitionInfo(topic, partition, leader, replicas, null)); - } - - return partitionInfoList; - } - - static int getReplicationFactor(List partitionInfoList) { - if (partitionInfoList.isEmpty()) - throw new RuntimeException("Partition list is empty"); - - int replicationFactor = partitionInfoList.get(0).replicas().length; - for (PartitionInfo partitionInfo : partitionInfoList) { - if (replicationFactor != partitionInfo.replicas().length) { - String topic = partitionInfoList.get(0).topic(); - throw new RuntimeException("Partitions of the topic " + topic + " have different replication factor"); - } - } - return replicationFactor; - } - - static boolean someBrokerNotPreferredLeader(List partitionInfoList, Collection brokers) { - Set brokersNotPreferredLeader = new HashSet<>(brokers.size()); - for (Broker broker: brokers) - brokersNotPreferredLeader.add(broker.id()); - for (PartitionInfo partitionInfo : partitionInfoList) - brokersNotPreferredLeader.remove(partitionInfo.replicas()[0].id()); - - return !brokersNotPreferredLeader.isEmpty(); - } - - static boolean someBrokerNotElectedLeader(List partitionInfoList, Collection brokers) { - Set brokersNotElectedLeader = new HashSet<>(brokers.size()); - for (Broker broker: brokers) - brokersNotElectedLeader.add(broker.id()); - for (PartitionInfo partitionInfo : partitionInfoList) { - if (partitionInfo.leader() != null) - brokersNotElectedLeader.remove(partitionInfo.leader().id()); - } - return !brokersNotElectedLeader.isEmpty(); - } - - /** - * @param topic topic - * @param partitionsToBeReassigned a map from partition (int) to replica list (int seq) - * - * @return a json string with the same format as output of 
kafka.utils.ZkUtils.formatAsReassignmentJson - * - * Example: - *
-     *   {"version":1,"partitions":[
-     *     {"topic":"kmf-topic","partition":1,"replicas":[0,1]},
-     *     {"topic":"kmf-topic","partition":2,"replicas":[1,2]},
-     *     {"topic":"kmf-topic","partition":0,"replicas":[2,0]}]}
-     * </pre>
- */ - private static String formatAsReassignmentJson(String topic, scala.collection.Map> partitionsToBeReassigned) { - StringBuilder bldr = new StringBuilder(); - bldr.append("{\"version\":1,\"partitions\":[\n"); - for (int partition = 0; partition < partitionsToBeReassigned.size(); partition++) { - bldr.append(" {\"topic\":\"").append(topic).append("\",\"partition\":").append(partition).append(",\"replicas\":["); - scala.collection.Seq replicas = partitionsToBeReassigned.apply(partition); - for (int replicaIndex = 0; replicaIndex < replicas.size(); replicaIndex++) { - Object replica = replicas.apply(replicaIndex); - bldr.append(replica).append(","); - } - bldr.setLength(bldr.length() - 1); - bldr.append("]},\n"); - } - bldr.setLength(bldr.length() - 2); - bldr.append("]}"); - return bldr.toString(); - } - - } -} - diff --git a/src/main/java/com/linkedin/kmf/topicfactory/DefaultTopicFactory.java b/src/main/java/com/linkedin/kmf/topicfactory/DefaultTopicFactory.java deleted file mode 100644 index 5158d24d..00000000 --- a/src/main/java/com/linkedin/kmf/topicfactory/DefaultTopicFactory.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -package com.linkedin.kmf.topicfactory; - -import com.linkedin.kmf.common.Utils; - -import java.util.Map; -import java.util.Properties; - - -public class DefaultTopicFactory implements TopicFactory { - - /** This constructor is required by TopicFactory but does nothing. */ - public DefaultTopicFactory(Map config) { - } - - @Override - public int createTopicIfNotExist(String zkUrl, String topic, int replicationFactor, double partitionToBrokerRatio, Properties topicConfig) { - return Utils.createMonitoringTopicIfNotExists(zkUrl, topic, replicationFactor, partitionToBrokerRatio, topicConfig); - } -} diff --git a/src/main/java/com/linkedin/kmf/KafkaMonitor.java b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java similarity index 51% rename from src/main/java/com/linkedin/kmf/KafkaMonitor.java rename to src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java index daa623c4..d516b076 100644 --- a/src/main/java/com/linkedin/kmf/KafkaMonitor.java +++ b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,42 +7,39 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
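DefaultTopicFactory above is the whole default implementation: it simply forwards to Utils.createMonitoringTopicIfNotExists. A hypothetical custom factory, to illustrate the extension point; MinRatioTopicFactory and its config key are invented for this sketch, and the package names follow the pre-rename code above:

import com.linkedin.kmf.common.Utils;
import com.linkedin.kmf.topicfactory.TopicFactory;
import java.util.Map;
import java.util.Properties;

public class MinRatioTopicFactory implements TopicFactory {
  private final double minRatio;

  public MinRatioTopicFactory(Map config) {
    // "min.partition.broker.ratio" is an illustrative key, not from the source.
    Object ratio = config.get("min.partition.broker.ratio");
    this.minRatio = ratio == null ? 1.0 : Double.parseDouble(ratio.toString());
  }

  @Override
  public int createTopicIfNotExist(String zkUrl, String topic, int replicationFactor,
      double partitionToBrokerRatio, Properties topicConfig) {
    // Enforce a floor on the ratio, then delegate to the default creation logic.
    return Utils.createMonitoringTopicIfNotExists(zkUrl, topic, replicationFactor,
        Math.max(partitionToBrokerRatio, minRatio), topicConfig);
  }
}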
 */
-package com.linkedin.kmf;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.linkedin.kmf.services.Service;
-import com.linkedin.kmf.apps.App;
+package com.linkedin.xinfra.monitor;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.linkedin.xinfra.monitor.apps.App;
+import com.linkedin.xinfra.monitor.services.Service;
+import com.linkedin.xinfra.monitor.services.ServiceFactory;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.lang.reflect.Constructor;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.kafka.common.metrics.JmxReporter;
-import org.apache.kafka.common.metrics.Measurable;
 import org.apache.kafka.common.metrics.MetricConfig;
 import org.apache.kafka.common.metrics.Metrics;
 import org.apache.kafka.common.metrics.MetricsReporter;
 import org.apache.kafka.common.utils.SystemTime;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.util.Map;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-
 /**
  * This is the main entry point of the monitor. It reads the configuration and manages the life cycle of the monitoring
  * applications.
 */
-public class KafkaMonitor {
-  private static final Logger LOG = LoggerFactory.getLogger(KafkaMonitor.class);
-  public static final String CLASS_NAME_CONFIG = "class.name";
-  private static final String METRIC_GROUP_NAME = "kafka-monitor";
-  private static final String JMX_PREFIX = "kmf";
+public class XinfraMonitor {
+  private static final Logger LOG = LoggerFactory.getLogger(XinfraMonitor.class);

  /** This is concurrent because healthCheck() can modify this map, but awaitShutdown() can be called at any time by
   * a different thread. */
@@ -51,27 +48,39 @@ public class KafkaMonitor {
   private final ConcurrentMap<String, Service> _services;
   private final ConcurrentMap<String, Object> _offlineRunnables;
   private final ScheduledExecutorService _executor;
-  /** When true start has been called on this instance of Kafka monitor. */
+  /** When true, start has been called on this instance of Xinfra Monitor. */
   private final AtomicBoolean _isRunning = new AtomicBoolean(false);

-  public KafkaMonitor(Map<String, Map> testProps) throws Exception {
+  /**
+   * The XinfraMonitor constructor creates apps and services for each of the individual clusters (properties) that are passed in.
+   * For example, if 10 clusters are to be monitored, this constructor creates 10 * num_apps_per_cluster apps
+   * and 10 * num_services_per_cluster services.
+   * @param allClusterProps the properties of ALL Kafka clusters for which apps and services need to be created
+   * @throws Exception if an exception occurs while creating the apps and services
+   */
+
+  @SuppressWarnings({"rawtypes"})
+  public XinfraMonitor(Map<String, Map> allClusterProps) throws Exception {
    _apps = new ConcurrentHashMap<>();
    _services = new ConcurrentHashMap<>();
-    for (Map.Entry<String, Map> entry : testProps.entrySet()) {
-      String name = entry.getKey();
-      Map props = entry.getValue();
-      if (!props.containsKey(CLASS_NAME_CONFIG))
-        throw new IllegalArgumentException(name + " is not configured with " + CLASS_NAME_CONFIG);
-      String className = (String) props.get(CLASS_NAME_CONFIG);
-
-      Class cls = Class.forName(className);
-      if (App.class.isAssignableFrom(cls)) {
-        App test = (App) Class.forName(className).getConstructor(Map.class, String.class).newInstance(props, name);
-        _apps.put(name, test);
-      } else if (Service.class.isAssignableFrom(cls)) {
-        Service service = (Service) Class.forName(className).getConstructor(Map.class, String.class).newInstance(props, name);
-        _services.put(name, service);
+    for (Map.Entry<String, Map> clusterProperty : allClusterProps.entrySet()) {
+      String clusterName = clusterProperty.getKey();
+      Map props = clusterProperty.getValue();
+      if (!props.containsKey(XinfraMonitorConstants.CLASS_NAME_CONFIG))
+        throw new IllegalArgumentException(clusterName + " is not configured with " + XinfraMonitorConstants.CLASS_NAME_CONFIG);
+      String className = (String) props.get(XinfraMonitorConstants.CLASS_NAME_CONFIG);
+
+      Class<?> aClass = Class.forName(className);
+      if (App.class.isAssignableFrom(aClass)) {
+        App clusterApp = (App) Class.forName(className).getConstructor(Map.class, String.class).newInstance(props, clusterName);
+        _apps.put(clusterName, clusterApp);
+      } else if (Service.class.isAssignableFrom(aClass)) {
+        ServiceFactory serviceFactory = (ServiceFactory) Class.forName(className + XinfraMonitorConstants.FACTORY)
+            .getConstructor(Map.class, String.class)
+            .newInstance(props, clusterName);
+        Service service = serviceFactory.createService();
+        _services.put(clusterName, service);
       } else {
         throw new IllegalArgumentException(className + " should implement either " + App.class.getSimpleName() + " or " + Service.class.getSimpleName());
       }
@@ -79,19 +88,22 @@ public KafkaMonitor(Map testProps) throws Exception {
     _executor = Executors.newSingleThreadScheduledExecutor();
     _offlineRunnables = new ConcurrentHashMap<>();
     List<MetricsReporter> reporters = new ArrayList<>();
-    reporters.add(new JmxReporter(JMX_PREFIX));
+    reporters.add(new JmxReporter(XinfraMonitorConstants.JMX_PREFIX));
     Metrics metrics = new Metrics(new MetricConfig(), reporters, new SystemTime());
-    metrics.addMetric(metrics.metricName("offline-runnable-count", METRIC_GROUP_NAME, "The number of Service/App that are not fully running"),
-      new Measurable() {
-        @Override
-        public double measure(MetricConfig config, long now) {
-          return _offlineRunnables.size();
-        }
-      }
-    );
+    metrics.addMetric(metrics.metricName("offline-runnable-count", XinfraMonitorConstants.METRIC_GROUP_NAME, "The number of Service/App that are not fully running"),
+        (config, now) -> _offlineRunnables.size());
+  }
+
+  private boolean constructorContainsClass(Constructor<?>[] constructors, Class<?> classObject) {
+    for (int n = 0; n < constructors[0].getParameterTypes().length; ++n) {
+      if (constructors[0].getParameterTypes()[n].equals(classObject)) {
+        return true;
+      }
+    }
+    return false;
   }

-  public synchronized void start() {
+  public synchronized void start() throws Exception {
     if (!_isRunning.compareAndSet(false, true)) {
       return;
     }
@@ -102,34 +114,37 @@ public synchronized void start() {
entry.getValue().start(); } - _executor.scheduleAtFixedRate( - new Runnable() { - @Override - public void run() { - try { - checkHealth(); - } catch (Exception e) { - LOG.error("Failed to check health of tests and services", e); - } - } - }, 5, 5, TimeUnit.SECONDS + long initialDelaySecond = 5; + long periodSecond = 5; + + _executor.scheduleAtFixedRate(() -> { + try { + checkHealth(); + } catch (Exception e) { + LOG.error("Failed to check health of apps and services", e); + } + }, initialDelaySecond, periodSecond, TimeUnit.SECONDS ); } private void checkHealth() { for (Map.Entry entry: _apps.entrySet()) { - if (!entry.getValue().isRunning()) { + if (!entry.getValue().isRunning()) _offlineRunnables.putIfAbsent(entry.getKey(), entry.getValue()); - LOG.error("App " + entry.getKey() + " is not fully running."); - } } for (Map.Entry entry: _services.entrySet()) { - if (!entry.getValue().isRunning()) { + if (!entry.getValue().isRunning()) _offlineRunnables.putIfAbsent(entry.getKey(), entry.getValue()); + } + + for (Map.Entry entry: _offlineRunnables.entrySet()) { + if (entry.getValue() instanceof App) + LOG.error("App " + entry.getKey() + " is not fully running."); + else LOG.error("Service " + entry.getKey() + " is not fully running."); - } } + } public synchronized void stop() { @@ -137,26 +152,26 @@ public synchronized void stop() { return; } _executor.shutdownNow(); - for (App test: _apps.values()) - test.stop(); + for (App app: _apps.values()) + app.stop(); for (Service service: _services.values()) service.stop(); } public void awaitShutdown() { - for (App test: _apps.values()) - test.awaitShutdown(); + for (App app: _apps.values()) + app.awaitShutdown(); for (Service service: _services.values()) - service.awaitShutdown(); + service.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } + @SuppressWarnings("rawtypes") public static void main(String[] args) throws Exception { if (args.length <= 0) { - LOG.info("USAGE: java [options] " + KafkaMonitor.class.getName() + " config/kafka-monitor.properties"); + LOG.info("USAGE: java [options] " + XinfraMonitor.class.getName() + " config/xinfra-monitor.properties"); return; } - StringBuilder buffer = new StringBuilder(); try (BufferedReader br = new BufferedReader(new FileReader(args[0].trim()))) { String line; @@ -168,11 +183,11 @@ public static void main(String[] args) throws Exception { @SuppressWarnings("unchecked") Map props = new ObjectMapper().readValue(buffer.toString(), Map.class); - KafkaMonitor kafkaMonitor = new KafkaMonitor(props); - kafkaMonitor.start(); - LOG.info("KafkaMonitor started"); + XinfraMonitor xinfraMonitor = new XinfraMonitor(props); + xinfraMonitor.start(); + LOG.info("Xinfra Monitor has started."); - kafkaMonitor.awaitShutdown(); + xinfraMonitor.awaitShutdown(); } } diff --git a/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitorConstants.java b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitorConstants.java new file mode 100644 index 00000000..f22c63c9 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitorConstants.java @@ -0,0 +1,45 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
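(Illustrative sketch, not part of the patch: main() above parses the properties file as a single JSON object keyed by app/service name, and each entry must carry the "class.name" key that the constructor checks. A minimal, self-contained example of that shape; the "topic" value is illustrative, not a complete configuration.)

    import com.fasterxml.jackson.databind.ObjectMapper;
    import java.util.Map;

    final class ConfigShapeSketch {
      @SuppressWarnings({"unchecked", "rawtypes"})
      public static void main(String[] args) throws Exception {
        String exampleJson = "{\n"
            + "  \"single-cluster-monitor\": {\n"
            + "    \"class.name\": \"com.linkedin.xinfra.monitor.apps.SingleClusterMonitor\",\n"
            + "    \"topic\": \"xinfra-monitor-topic\"\n"
            + "  }\n"
            + "}";
        // Same parse main() performs on the file contents.
        Map<String, Map> props = new ObjectMapper().readValue(exampleJson, Map.class);
        System.out.println(props.keySet()); // -> [single-cluster-monitor]
        // new XinfraMonitor(props).start() would then build the app against a live cluster.
      }
    }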
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor; + +/** + * Constant variables in Xinfra Monitor repo. + */ +public class XinfraMonitorConstants { + + public XinfraMonitorConstants() { + + } + + public static final String TAGS_NAME = "name"; + + public static final String FALSE = "false"; + + public static final String XINFRA_MONITOR_PREFIX = "xinfra-monitor-"; + + public static final String TOPIC_MANIPULATION_SERVICE_TOPIC = + "xinfra-monitor-cluster-topic-manipulation-service-topic-"; + + public static final String KAFKA_LOG_DIRECTORY = "/tmp/kafka-logs"; + + public static final int TOPIC_MANIPULATION_TOPIC_NUM_PARTITIONS = 3; + + static final String FACTORY = "Factory"; + + static final String CLASS_NAME_CONFIG = "class.name"; + + public static final String METRIC_GROUP_NAME = "kafka-monitor"; + + public static final String JMX_PREFIX = "kmf"; + + public static final String METRIC_GROUP_NAME_PRODUCE_SERVICE = "produce-service"; + +} diff --git a/src/main/java/com/linkedin/kmf/apps/App.java b/src/main/java/com/linkedin/xinfra/monitor/apps/App.java similarity index 78% rename from src/main/java/com/linkedin/kmf/apps/App.java rename to src/main/java/com/linkedin/xinfra/monitor/apps/App.java index c1ac8ee0..cdc44be0 100644 --- a/src/main/java/com/linkedin/kmf/apps/App.java +++ b/src/main/java/com/linkedin/xinfra/monitor/apps/App.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,11 +7,12 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.apps; + +package com.linkedin.xinfra.monitor.apps; public interface App { - void start(); + void start() throws Exception; void stop(); diff --git a/src/main/java/com/linkedin/kmf/apps/MultiClusterMonitor.java b/src/main/java/com/linkedin/xinfra/monitor/apps/MultiClusterMonitor.java similarity index 65% rename from src/main/java/com/linkedin/kmf/apps/MultiClusterMonitor.java rename to src/main/java/com/linkedin/xinfra/monitor/apps/MultiClusterMonitor.java index 0ae0332c..f4aa8c1e 100644 --- a/src/main/java/com/linkedin/kmf/apps/MultiClusterMonitor.java +++ b/src/main/java/com/linkedin/xinfra/monitor/apps/MultiClusterMonitor.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. 
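(Illustrative sketch, not part of the patch: the XinfraMonitor constructor resolves a Service implementation to its factory by appending the FACTORY suffix from XinfraMonitorConstants to the configured class name, then invokes the factory reflectively. A minimal sketch of that convention; the service class name here is hypothetical.)

    import java.lang.reflect.Constructor;
    import java.util.Map;

    final class FactoryLookupSketch {
      // "com.example.FooService" resolves to "com.example.FooServiceFactory",
      // which must expose a (Map, String) constructor and a createService() method.
      static Object createServiceViaFactory(String className, Map<String, Object> props, String clusterName)
          throws Exception {
        Class<?> factoryClass = Class.forName(className + "Factory");
        Constructor<?> ctor = factoryClass.getConstructor(Map.class, String.class);
        Object factory = ctor.newInstance(props, clusterName);
        return factoryClass.getMethod("createService").invoke(factory);
      }
    }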
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,14 +7,18 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.apps; -import com.linkedin.kmf.apps.configs.MultiClusterMonitorConfig; -import com.linkedin.kmf.services.ConsumeService; -import com.linkedin.kmf.services.MultiClusterTopicManagementService; -import com.linkedin.kmf.services.ProduceService; +package com.linkedin.xinfra.monitor.apps; + +import com.linkedin.xinfra.monitor.apps.configs.MultiClusterMonitorConfig; +import com.linkedin.xinfra.monitor.services.ConsumeService; +import com.linkedin.xinfra.monitor.services.ConsumerFactoryImpl; +import com.linkedin.xinfra.monitor.services.MultiClusterTopicManagementService; +import com.linkedin.xinfra.monitor.services.ProduceService; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,10 +30,11 @@ * across Kafka clusters and make sure they have the same number of partitions. */ +@SuppressWarnings("rawtypes") public class MultiClusterMonitor implements App { private static final Logger LOG = LoggerFactory.getLogger(MultiClusterMonitor.class); - private final MultiClusterTopicManagementService _topicManagementService; + private final MultiClusterTopicManagementService _multiClusterTopicManagementService; private final ProduceService _produceService; private final ConsumeService _consumeService; private final String _name; @@ -37,9 +42,11 @@ public class MultiClusterMonitor implements App { public MultiClusterMonitor(Map props, String name) throws Exception { _name = name; MultiClusterMonitorConfig config = new MultiClusterMonitorConfig(props); - _topicManagementService = new MultiClusterTopicManagementService(createMultiClusterTopicManagementServiceProps(props, config), name); + _multiClusterTopicManagementService = new MultiClusterTopicManagementService(createMultiClusterTopicManagementServiceProps(props, config), name); + CompletableFuture topicPartitionReady = _multiClusterTopicManagementService.topicPartitionResult(); _produceService = new ProduceService(createProduceServiceProps(props, config), name); - _consumeService = new ConsumeService(createConsumeServiceProps(props, config), name); + ConsumerFactoryImpl consumerFactory = new ConsumerFactoryImpl(createConsumeServiceProps(props, config)); + _consumeService = new ConsumeService(name, topicPartitionReady, consumerFactory); } @SuppressWarnings("unchecked") @@ -60,7 +67,6 @@ private Map createConsumeServiceProps(Map props, return serviceProps; } - @SuppressWarnings("unchecked") private Map createMultiClusterTopicManagementServiceProps(Map props, MultiClusterMonitorConfig config) { Map serviceProps = new HashMap<>(); serviceProps.put(MultiClusterMonitorConfig.TOPIC_MANAGEMENT_SERVICE_CONFIG, props.get(MultiClusterMonitorConfig.TOPIC_MANAGEMENT_SERVICE_CONFIG)); @@ -70,15 +76,18 @@ private Map createMultiClusterTopicManagementServiceProps(Map topicPartitionResult = _multiClusterTopicManagementService.topicPartitionResult(); + topicPartitionResult.thenRun(() -> { + _produceService.start(); + _consumeService.start(); + }); + LOG.info(_name + "/MultiClusterMonitor started."); } @Override public void stop() { - _topicManagementService.stop(); + 
_multiClusterTopicManagementService.stop(); _produceService.stop(); _consumeService.stop(); LOG.info(_name + "/MultiClusterMonitor stopped"); @@ -86,13 +95,13 @@ public void stop() { @Override public boolean isRunning() { - return _topicManagementService.isRunning() && _produceService.isRunning() && _consumeService.isRunning(); + return _multiClusterTopicManagementService.isRunning() && _produceService.isRunning() && _consumeService.isRunning(); } @Override public void awaitShutdown() { - _topicManagementService.awaitShutdown(); - _produceService.awaitShutdown(); - _consumeService.awaitShutdown(); + _multiClusterTopicManagementService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + _produceService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + _consumeService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } } diff --git a/src/main/java/com/linkedin/kmf/apps/SingleClusterMonitor.java b/src/main/java/com/linkedin/xinfra/monitor/apps/SingleClusterMonitor.java similarity index 51% rename from src/main/java/com/linkedin/kmf/apps/SingleClusterMonitor.java rename to src/main/java/com/linkedin/xinfra/monitor/apps/SingleClusterMonitor.java index d118930f..a44b0827 100644 --- a/src/main/java/com/linkedin/kmf/apps/SingleClusterMonitor.java +++ b/src/main/java/com/linkedin/xinfra/monitor/apps/SingleClusterMonitor.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,32 +7,39 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
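(Illustrative sketch, not part of the patch: MultiClusterMonitor.start() above chains the produce/consume start-up onto the topic-management future via thenRun. A minimal example of that gating pattern, with a println standing in for the real Service objects.)

    import java.util.concurrent.CompletableFuture;

    final class StartupGateSketch {
      public static void main(String[] args) {
        CompletableFuture<Void> topicPartitionResult = new CompletableFuture<>();
        // Dependent services register their start-up against the future...
        topicPartitionResult.thenRun(() -> System.out.println("starting produce and consume services"));
        // ...and run only after topic management completes it, i.e. once the monitor topic is ready.
        topicPartitionResult.complete(null);
      }
    }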
*/ -package com.linkedin.kmf.apps; - -import com.linkedin.kmf.services.TopicManagementService; -import com.linkedin.kmf.services.configs.ConsumeServiceConfig; -import com.linkedin.kmf.services.configs.DefaultMetricsReporterServiceConfig; -import com.linkedin.kmf.services.configs.MultiClusterTopicManagementServiceConfig; -import com.linkedin.kmf.services.configs.ProduceServiceConfig; -import com.linkedin.kmf.services.ConsumeService; -import com.linkedin.kmf.services.JettyService; -import com.linkedin.kmf.services.JolokiaService; -import com.linkedin.kmf.services.DefaultMetricsReporterService; -import com.linkedin.kmf.services.ProduceService; -import com.linkedin.kmf.services.configs.TopicManagementServiceConfig; + +package com.linkedin.xinfra.monitor.apps; + +import com.linkedin.xinfra.monitor.services.ConsumeService; +import com.linkedin.xinfra.monitor.services.ConsumerFactory; +import com.linkedin.xinfra.monitor.services.ConsumerFactoryImpl; +import com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService; +import com.linkedin.xinfra.monitor.services.JolokiaService; +import com.linkedin.xinfra.monitor.services.ProduceService; +import com.linkedin.xinfra.monitor.services.Service; +import com.linkedin.xinfra.monitor.services.TopicManagementService; +import com.linkedin.xinfra.monitor.services.configs.ConsumeServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.DefaultMetricsReporterServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.MultiClusterTopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.ProduceServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.TopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.services.metrics.ClusterTopicManipulationMetrics; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; import net.sourceforge.argparse4j.ArgumentParsers; import net.sourceforge.argparse4j.inf.ArgumentParser; import net.sourceforge.argparse4j.inf.Namespace; +import org.apache.kafka.common.utils.Utils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.kafka.common.utils.Utils; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import static net.sourceforge.argparse4j.impl.Arguments.store; +import static com.linkedin.xinfra.monitor.common.Utils.prettyPrint; /* * The SingleClusterMonitor app is intended to monitor the performance and availability of a given Kafka cluster. 
It creates @@ -46,44 +53,109 @@ public class SingleClusterMonitor implements App { private static final Logger LOG = LoggerFactory.getLogger(SingleClusterMonitor.class); + private static final int SERVICES_INITIAL_CAPACITY = 4; private final TopicManagementService _topicManagementService; - private final ProduceService _produceService; - private final ConsumeService _consumeService; - private final String _name; - - public SingleClusterMonitor(Map props, String name) throws Exception { - _name = name; - _topicManagementService = new TopicManagementService(props, name); - _produceService = new ProduceService(props, name); - _consumeService = new ConsumeService(props, name); + private final String _clusterName; + private final List _allServices; + private final boolean _isTopicManagementServiceEnabled; + + public SingleClusterMonitor(Map props, String clusterName) throws Exception { + ConsumerFactory consumerFactory = new ConsumerFactoryImpl(props); + _clusterName = clusterName; + LOG.info("SingleClusterMonitor properties: {}", prettyPrint(props)); + TopicManagementServiceConfig config = new TopicManagementServiceConfig(props); + _isTopicManagementServiceEnabled = + config.getBoolean(TopicManagementServiceConfig.TOPIC_MANAGEMENT_ENABLED_CONFIG); + _allServices = new ArrayList<>(SERVICES_INITIAL_CAPACITY); + CompletableFuture topicPartitionResult; + if (_isTopicManagementServiceEnabled) { + String topicManagementServiceName = String.format("Topic-management-service-for-%s", clusterName); + _topicManagementService = new TopicManagementService(props, topicManagementServiceName); + topicPartitionResult = _topicManagementService.topicPartitionResult(); + + // block on the MultiClusterTopicManagementService to complete. + topicPartitionResult.get(); + + _allServices.add(_topicManagementService); + } else { + _topicManagementService = null; + topicPartitionResult = new CompletableFuture<>(); + topicPartitionResult.complete(null); + } + ProduceService produceService = new ProduceService(props, clusterName); + ConsumeService consumeService = new ConsumeService(clusterName, topicPartitionResult, consumerFactory); + _allServices.add(produceService); + _allServices.add(consumeService); } @Override - public void start() { - _topicManagementService.start(); - _produceService.start(); - _consumeService.start(); - LOG.info(_name + "/SingleClusterMonitor started"); + public void start() throws Exception { + if (_isTopicManagementServiceEnabled) { + _topicManagementService.start(); + CompletableFuture topicPartitionResult = _topicManagementService.topicPartitionResult(); + + try { + /* Delay 2 second to reduce the chance that produce and consumer thread has race condition + with TopicManagementService and MultiClusterTopicManagementService */ + long threadSleepMs = TimeUnit.SECONDS.toMillis(2); + Thread.sleep(threadSleepMs); + } catch (InterruptedException e) { + throw new Exception("Interrupted while sleeping the thread", e); + } + CompletableFuture topicPartitionFuture = topicPartitionResult.thenRun(() -> { + for (Service service : _allServices) { + if (!service.isRunning()) { + LOG.debug("Now starting {}", service.getServiceName()); + service.start(); + } + } + }); + + try { + topicPartitionFuture.get(); + } catch (InterruptedException | ExecutionException e) { + throw new Exception("Exception occurred while getting the TopicPartitionFuture", e); + } + + } else { + for (Service service : _allServices) { + if (!service.isRunning()) { + LOG.debug("Now starting {}", service.getServiceName()); + 
service.start(); + } + } + } + + LOG.info(_clusterName + "/SingleClusterMonitor started!"); } @Override public void stop() { - _topicManagementService.stop(); - _produceService.stop(); - _consumeService.stop(); - LOG.info(_name + "/SingleClusterMonitor stopped"); + for (Service service : _allServices) { + service.stop(); + } + LOG.info(_clusterName + "/SingleClusterMonitor stopped."); } @Override public boolean isRunning() { - return _topicManagementService.isRunning() && _produceService.isRunning() && _consumeService.isRunning(); + boolean isRunning = true; + + for (Service service : _allServices) { + if (!service.isRunning()) { + isRunning = false; + LOG.info("{} is not running.", service.getServiceName()); + } + } + + return isRunning; } @Override public void awaitShutdown() { - _topicManagementService.awaitShutdown(); - _produceService.awaitShutdown(); - _consumeService.awaitShutdown(); + for (Service service : _allServices) { + service.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + } } /** Get the command-line argument parser. */ @@ -94,7 +166,7 @@ private static ArgumentParser argParser() { .description(""); parser.addArgument("--topic") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("TOPIC") @@ -102,14 +174,14 @@ private static ArgumentParser argParser() { .help("Produce messages to this topic and consume message from this topic"); parser.addArgument("--producer-id") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .dest("producerId") .help("The producerId will be used by producer client and encoded in the messages to the topic"); parser.addArgument("--broker-list") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(true) .type(String.class) .metavar("HOST1:PORT1[,HOST2:PORT2[...]]") @@ -117,7 +189,7 @@ private static ArgumentParser argParser() { .help("Comma-separated list of Kafka brokers in the form HOST1:PORT1,HOST2:PORT2,..."); parser.addArgument("--zookeeper") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(true) .type(String.class) .metavar("HOST:PORT") @@ -125,7 +197,7 @@ private static ArgumentParser argParser() { .help("The connection string for the zookeeper connection in the form host:port"); parser.addArgument("--record-size") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("RECORD_SIZE") @@ -133,7 +205,7 @@ private static ArgumentParser argParser() { .help("The size of each record."); parser.addArgument("--producer-class") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("PRODUCER_CLASS_NAME") @@ -141,7 +213,7 @@ private static ArgumentParser argParser() { .help("Specify the class of producer. Available choices include newProducer or class name"); parser.addArgument("--consumer-class") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("CONSUMER_CLASS_NAME") @@ -149,7 +221,7 @@ private static ArgumentParser argParser() { .help("Specify the class of consumer. 
Available choices include oldConsumer, newConsumer, or class name"); parser.addArgument("--producer.config") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("PRODUCER_CONFIG") @@ -157,7 +229,7 @@ private static ArgumentParser argParser() { .help("Producer config properties file."); parser.addArgument("--consumer.config") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("CONSUMER_CONFIG") @@ -165,7 +237,7 @@ private static ArgumentParser argParser() { .help("Consumer config properties file."); parser.addArgument("--report-interval-sec") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("REPORT_INTERVAL_SEC") @@ -173,7 +245,7 @@ private static ArgumentParser argParser() { .help("Interval in sec with which to export stats"); parser.addArgument("--record-delay-ms") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("RECORD_DELAY_MS") @@ -181,7 +253,7 @@ private static ArgumentParser argParser() { .help("The delay in ms before sending next record to the same partition"); parser.addArgument("--latency-percentile-max-ms") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("LATENCY_PERCENTILE_MAX_MS") @@ -190,7 +262,7 @@ private static ArgumentParser argParser() { "The percentile will be reported as Double.POSITIVE_INFINITY if its value exceeds the max value."); parser.addArgument("--latency-percentile-granularity-ms") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(String.class) .metavar("LATENCY_PERCENTILE_GRANULARITY_MS") @@ -198,15 +270,31 @@ private static ArgumentParser argParser() { .help("The granularity in ms of latency percentile metric. 
This is the width of the bucket used in percentile calculation."); parser.addArgument("--topic-creation-enabled") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(Boolean.class) .metavar("AUTO_TOPIC_CREATION_ENABLED") .dest("autoTopicCreationEnabled") .help(TopicManagementServiceConfig.TOPIC_CREATION_ENABLED_DOC); + parser.addArgument("--topic-add-partition-enabled") + .action(net.sourceforge.argparse4j.impl.Arguments.store()) + .required(false) + .type(Boolean.class) + .metavar("TOPIC_ADD_PARTITION_ENABLED") + .dest("topicAddPartitionEnabled") + .help(TopicManagementServiceConfig.TOPIC_ADD_PARTITION_ENABLED_DOC); + + parser.addArgument("--topic-reassign-partition-and-elect-leader-enabled") + .action(net.sourceforge.argparse4j.impl.Arguments.store()) + .required(false) + .type(Boolean.class) + .metavar("TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED") + .dest("topicReassignPartitionAndElectLeaderEnabled") + .help(TopicManagementServiceConfig.TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_DOC); + parser.addArgument("--replication-factor") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(Integer.class) .metavar("REPLICATION_FACTOR") @@ -214,13 +302,21 @@ private static ArgumentParser argParser() { .help(TopicManagementServiceConfig.TOPIC_REPLICATION_FACTOR_DOC); parser.addArgument("--topic-rebalance-interval-ms") - .action(store()) + .action(net.sourceforge.argparse4j.impl.Arguments.store()) .required(false) .type(Integer.class) .metavar("REBALANCE_MS") .dest("rebalanceMs") .help(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_DOC); + parser.addArgument("--topic-preferred-leader-election-interval-ms") + .action(net.sourceforge.argparse4j.impl.Arguments.store()) + .required(false) + .type(Integer.class) + .metavar("PREFERED_LEADER_ELECTION_INTERVAL_MS") + .dest("preferredLeaderElectionIntervalMs") + .help(MultiClusterTopicManagementServiceConfig.PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_DOC); + return parser; } @@ -232,9 +328,7 @@ public static void main(String[] args) throws Exception { } Namespace res = parser.parseArgs(args); - Map props = new HashMap<>(); - // produce service config props.put(ProduceServiceConfig.ZOOKEEPER_CONNECT_CONFIG, res.getString("zkConnect")); props.put(ProduceServiceConfig.BOOTSTRAP_SERVERS_CONFIG, res.getString("brokerList")); @@ -266,11 +360,16 @@ public static void main(String[] args) throws Exception { // topic management service config if (res.getBoolean("autoTopicCreationEnabled") != null) props.put(TopicManagementServiceConfig.TOPIC_CREATION_ENABLED_CONFIG, res.getBoolean("autoTopicCreationEnabled")); + if (res.getBoolean("topicAddPartitionEnabled") != null) + props.put(TopicManagementServiceConfig.TOPIC_ADD_PARTITION_ENABLED_CONFIG, res.getBoolean("topicAddPartitionEnabled")); + if (res.getBoolean("topicReassignPartitionAndElectLeaderEnabled") != null) + props.put(TopicManagementServiceConfig.TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_CONFIG, res.getBoolean("topicReassignPartitionAndElectLeaderEnabled")); if (res.getInt("replicationFactor") != null) props.put(TopicManagementServiceConfig.TOPIC_REPLICATION_FACTOR_CONFIG, res.getInt("replicationFactor")); if (res.getInt("rebalanceMs") != null) props.put(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG, res.getInt("rebalanceMs")); - + if (res.getLong("preferredLeaderElectionIntervalMs") != null) + 
props.put(MultiClusterTopicManagementServiceConfig.PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_CONFIG, res.getLong("preferredLeaderElectionIntervalMs")); SingleClusterMonitor app = new SingleClusterMonitor(props, "single-cluster-monitor"); app.start(); @@ -279,31 +378,49 @@ public static void main(String[] args) throws Exception { if (res.getString("reportIntervalSec") != null) props.put(DefaultMetricsReporterServiceConfig.REPORT_INTERVAL_SEC_CONFIG, res.getString("reportIntervalSec")); List metrics = Arrays.asList( + "kmf.services:type=consume-service,name=*:topic-partitions-count", "kmf.services:type=produce-service,name=*:produce-availability-avg", "kmf.services:type=consume-service,name=*:consume-availability-avg", "kmf.services:type=produce-service,name=*:records-produced-total", "kmf.services:type=consume-service,name=*:records-consumed-total", "kmf.services:type=consume-service,name=*:records-lost-total", + "kmf.services:type=consume-service,name=*:records-lost-rate", "kmf.services:type=consume-service,name=*:records-duplicated-total", "kmf.services:type=consume-service,name=*:records-delay-ms-avg", "kmf.services:type=produce-service,name=*:records-produced-rate", "kmf.services:type=produce-service,name=*:produce-error-rate", - "kmf.services:type=consume-service,name=*:consume-error-rate"); + "kmf.services:type=consume-service,name=*:consume-error-rate", + "kmf.services:type=commit-availability-service,name=*:offsets-committed-total", + "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg", + "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total", + "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th", + "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th", + "kmf.services:type=offset-commit-service,name=*:offset-commit-availability-avg", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-rate", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-total", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-rate", + "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-total", + + "kmf.services:type=" + ClusterTopicManipulationMetrics.METRIC_GROUP_NAME + + ",name=*:topic-creation-metadata-propagation-ms-avg", + "kmf.services:type=" + ClusterTopicManipulationMetrics.METRIC_GROUP_NAME + + ",name=*:topic-creation-metadata-propagation-ms-max", + "kmf.services:type=" + ClusterTopicManipulationMetrics.METRIC_GROUP_NAME + + ",name=*:topic-deletion-metadata-propagation-ms-avg", + "kmf.services:type=" + ClusterTopicManipulationMetrics.METRIC_GROUP_NAME + + ",name=*:topic-deletion-metadata-propagation-ms-max" + ); + props.put(DefaultMetricsReporterServiceConfig.REPORT_METRICS_CONFIG, metrics); DefaultMetricsReporterService metricsReporterService = new DefaultMetricsReporterService(props, "end-to-end"); metricsReporterService.start(); - JolokiaService jolokiaService = new JolokiaService(new HashMap(), "end-to-end"); + JolokiaService jolokiaService = new JolokiaService(new HashMap<>(), "end-to-end"); jolokiaService.start(); - - JettyService jettyService = new 
JettyService(new HashMap(), "end-to-end"); - jettyService.start(); - - if (!app.isRunning()) { - LOG.error("Some services have stopped"); - System.exit(-1); - } - app.awaitShutdown(); } } diff --git a/src/main/java/com/linkedin/kmf/apps/configs/MultiClusterMonitorConfig.java b/src/main/java/com/linkedin/xinfra/monitor/apps/configs/MultiClusterMonitorConfig.java similarity index 86% rename from src/main/java/com/linkedin/kmf/apps/configs/MultiClusterMonitorConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/apps/configs/MultiClusterMonitorConfig.java index 9fe128c7..6abefd48 100644 --- a/src/main/java/com/linkedin/kmf/apps/configs/MultiClusterMonitorConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/apps/configs/MultiClusterMonitorConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -8,10 +8,10 @@ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.apps.configs; +package com.linkedin.xinfra.monitor.apps.configs; -import com.linkedin.kmf.services.configs.CommonServiceConfig; -import com.linkedin.kmf.services.configs.MultiClusterTopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.CommonServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.MultiClusterTopicManagementServiceConfig; import java.util.Map; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; @@ -48,4 +48,5 @@ public Double getDouble(String key) { public MultiClusterMonitorConfig(Map props) { super(CONFIG, props); } + } diff --git a/src/main/java/com/linkedin/xinfra/monitor/common/ConfigDocumentationGenerator.java b/src/main/java/com/linkedin/xinfra/monitor/common/ConfigDocumentationGenerator.java new file mode 100644 index 00000000..a82afd9b --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/common/ConfigDocumentationGenerator.java @@ -0,0 +1,69 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.common; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.lang.reflect.Field; +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Generates the table of configuration parameters, their documentation strings and default values. 
+ */
+public class ConfigDocumentationGenerator {
+  private static final Logger LOG = LoggerFactory.getLogger(ConfigDocumentationGenerator.class);
+
+  private static void printHelp() {
+    System.out.println("ConfigDocumentationGenerator outputDirectory configClassNames...");
+  }
+
+  private static void printHtmlHeader(Writer out, String docClass) throws IOException {
+    out.write("<html><head><title>Kafka Monitoring Automatically Generated Documentation.</title></head><body>\n");
+    out.write("<h1>");
+    out.write(docClass);
+    out.write("</h1>\n");
+  }
+
+  private static void printHtmlFooter(Writer out) throws IOException {
+    out.write("</body>\n</html>\n");
+  }
+
+  public static void main(String[] argv) throws Exception {
+    if (argv.length < 2) {
+      printHelp();
+      System.exit(1);
+    }
+
+    File outputDir = new File(argv[0]);
+    if (!outputDir.exists()) {
+      outputDir.mkdirs();
+    }
+
+    for (int i = 1; i < argv.length; i++) {
+      Class<? extends AbstractConfig> configClass = (Class<? extends AbstractConfig>) Class.forName(argv[i]);
+      Field configDefField = configClass.getDeclaredField("CONFIG");
+      configDefField.setAccessible(true);
+      ConfigDef configDef = (ConfigDef) configDefField.get(null);
+      String docClass = configClass.getSimpleName();
+      File outputFile = new File(outputDir, docClass + ".html");
+      try (FileWriter fout = new FileWriter(outputFile)) {
+        printHtmlHeader(fout, docClass);
+        fout.write(configDef.toHtmlTable());
+        printHtmlFooter(fout);
+      }
+    }
+  }
+}
diff --git a/src/main/java/com/linkedin/xinfra/monitor/common/ConsumerGroupCoordinatorUtils.java b/src/main/java/com/linkedin/xinfra/monitor/common/ConsumerGroupCoordinatorUtils.java
new file mode 100644
index 00000000..897b4a70
--- /dev/null
+++ b/src/main/java/com/linkedin/xinfra/monitor/common/ConsumerGroupCoordinatorUtils.java
@@ -0,0 +1,85 @@
+/**
+ * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+ * file except in compliance with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+package com.linkedin.xinfra.monitor.common;
+
+import java.util.Collections;
+import java.util.concurrent.ExecutionException;
+import org.apache.kafka.clients.admin.AdminClient;
+import org.apache.kafka.common.internals.Topic;
+import org.apache.kafka.common.utils.Utils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class ConsumerGroupCoordinatorUtils {
+  private static final Logger LOGGER = LoggerFactory.getLogger(ConsumerGroupCoordinatorUtils.class);
+  private static final String CONSUMER_GROUP_PREFIX_CANDIDATE = "__shadow_consumer_group-";
+
+  /**
+   * https://github.com/apache/kafka/blob/trunk/core/src/main/scala/kafka/coordinator/group/GroupMetadataManager.scala#L189
+   * The consumer group id's hash code is used for this modulo operation.
+   * @param groupId kafka consumer group ID
+   * @param consumerOffsetsTopicPartitions number of partitions in the __consumer_offsets topic.
+   * @return the __consumer_offsets partition (computed with Kafka's Utils.abs()) that the group maps to;
+   * the leader of that partition is the broker acting as the group coordinator.
+   */
+  public static int partitionFor(String groupId, int consumerOffsetsTopicPartitions) {
+    LOGGER.debug("Group id hash before modulo: {}", groupId.hashCode());
+    return Utils.abs(groupId.hashCode()) % consumerOffsetsTopicPartitions;
+  }
+
+  /**
+   * Finds a consumer group id that maps to the same __consumer_offsets partition as the target group,
+   * i.e. one whose hash(group.id) % (number of __consumer_offsets topic partitions) matches the target's.
+   * The leader of that partition is the group coordinator, so the two groups share a coordinator:
+   * choose B s.t. hash(A) % (number of __consumer_offsets topic partitions) == hash(B) % (number of __consumer_offsets topic partitions).
+   * @param targetGroupId the identifier of the target consumer group
+   * @param adminClient the AdminClient used to look up the __consumer_offsets partition count
+   */
+  public static String findCollision(String targetGroupId, AdminClient adminClient)
+      throws ExecutionException, InterruptedException {
+    if (targetGroupId.equals("")) {
+      throw new IllegalArgumentException("The target consumer group identifier cannot be empty: " + targetGroupId);
+    }
+
+    int numOffsetsTopicPartitions = adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME))
+        .values()
+        .get(Topic.GROUP_METADATA_TOPIC_NAME)
+        .get()
+        .partitions()
+        .size();
+
+    // The target partition is loop-invariant, so compute it once outside the loop.
+    int targetConsumerOffsetsPartition = partitionFor(targetGroupId, numOffsetsTopicPartitions);
+
+    // A local counter suffices; the value is discarded at the end of the computation.
+    int groupSuffix = 0;
+
+    // Hold the result in a local so it is not computed twice; this reduces the possibility of bugs.
+    String newConsumerGroup;
+
+    // Use while (true); an explicit halting condition would be harder to read.
+    while (true) {
+      // TODO: could play fancy StringBuilder games here to make this generate less garbage
+      newConsumerGroup = CONSUMER_GROUP_PREFIX_CANDIDATE + groupSuffix++;
+      int newGroupNamePartition = ConsumerGroupCoordinatorUtils.partitionFor(newConsumerGroup, numOffsetsTopicPartitions);
+      if (newGroupNamePartition == targetConsumerOffsetsPartition) {
+        break;
+      }
+    }
+
+    return newConsumerGroup;
+  }
+}
+
diff --git a/src/main/java/com/linkedin/kmf/common/DefaultTopicSchema.java b/src/main/java/com/linkedin/xinfra/monitor/common/DefaultTopicSchema.java
similarity index 70%
rename from src/main/java/com/linkedin/kmf/common/DefaultTopicSchema.java
rename to src/main/java/com/linkedin/xinfra/monitor/common/DefaultTopicSchema.java
index e248807a..cb1dc34c 100644
--- a/src/main/java/com/linkedin/kmf/common/DefaultTopicSchema.java
+++ b/src/main/java/com/linkedin/xinfra/monitor/common/DefaultTopicSchema.java
@@ -1,5 +1,5 @@
 /**
- * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+ * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
  * file except in compliance with the License. You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
@@ -7,7 +7,8 @@
  * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
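(Illustrative sketch, not part of the patch: usage of ConsumerGroupCoordinatorUtils.findCollision above. The bootstrap address and target group id are hypothetical.)

    import com.linkedin.xinfra.monitor.common.ConsumerGroupCoordinatorUtils;
    import java.util.Properties;
    import org.apache.kafka.clients.admin.AdminClient;
    import org.apache.kafka.clients.admin.AdminClientConfig;

    final class FindCollisionSketch {
      public static void main(String[] args) throws Exception {
        Properties adminProps = new Properties();
        adminProps.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        try (AdminClient adminClient = AdminClient.create(adminProps)) {
          // The returned "shadow" group hashes to the same __consumer_offsets
          // partition as the target group, so both share a group coordinator.
          String shadowGroup = ConsumerGroupCoordinatorUtils.findCollision("xm-target-group", adminClient);
          System.out.println(shadowGroup);
        }
      }
    }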
*/ -package com.linkedin.kmf.common; + +package com.linkedin.xinfra.monitor.common; import java.util.Arrays; import org.apache.avro.Schema; @@ -15,17 +16,17 @@ public class DefaultTopicSchema { - public static final Field TOPIC_FIELD = new Field("topic", Schema.create(Schema.Type.STRING), null, null); + static final Field TOPIC_FIELD = new Field("topic", Schema.create(Schema.Type.STRING), null, null); public static final Field TIME_FIELD = new Field("time", Schema.create(Schema.Type.LONG), null, null); public static final Field INDEX_FIELD = new Field("index", Schema.create(Schema.Type.LONG), null, null); - public static final Field PRODUCER_ID_FIELD = new Field("producerId", Schema.create(Schema.Type.STRING), null, null); + static final Field PRODUCER_ID_FIELD = new Field("producerId", Schema.create(Schema.Type.STRING), null, null); - public static final Field CONTENT_FIELD = new Field("content", Schema.create(Schema.Type.STRING), null, null); + static final Field CONTENT_FIELD = new Field("content", Schema.create(Schema.Type.STRING), null, null); - public static final Schema MESSAGE_V0; + static final Schema MESSAGE_V0; static { MESSAGE_V0 = Schema.createRecord("KafkaMonitorSchema", null, "kafka.monitor", false); diff --git a/src/main/java/com/linkedin/kmf/common/MbeanAttributeValue.java b/src/main/java/com/linkedin/xinfra/monitor/common/MbeanAttributeValue.java similarity index 89% rename from src/main/java/com/linkedin/kmf/common/MbeanAttributeValue.java rename to src/main/java/com/linkedin/xinfra/monitor/common/MbeanAttributeValue.java index 3dff36cf..35c6f511 100644 --- a/src/main/java/com/linkedin/kmf/common/MbeanAttributeValue.java +++ b/src/main/java/com/linkedin/xinfra/monitor/common/MbeanAttributeValue.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.common; + +package com.linkedin.xinfra.monitor.common; public class MbeanAttributeValue { private final String _mbean; diff --git a/src/main/java/com/linkedin/xinfra/monitor/common/Utils.java b/src/main/java/com/linkedin/xinfra/monitor/common/Utils.java new file mode 100644 index 00000000..d920437d --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/common/Utils.java @@ -0,0 +1,273 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +package com.linkedin.xinfra.monitor.common; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.linkedin.avroutil1.compatibility.AvroCodecUtil; +import com.linkedin.avroutil1.compatibility.AvroCompatibilityHelper; +import com.linkedin.avroutil1.compatibility.AvroVersion; +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import javax.management.MBeanAttributeInfo; +import javax.management.MBeanInfo; +import javax.management.MBeanServer; +import javax.management.ObjectName; +import kafka.admin.BrokerMetadata; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.Decoder; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.CreateTopicsResult; +import org.apache.kafka.clients.admin.ListPartitionReassignmentsResult; +import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.admin.PartitionReassignment; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.errors.TopicExistsException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Xinfra Monitor utilities. + */ +public class Utils { + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); + public static final int ZK_CONNECTION_TIMEOUT_MS = 30_000; + public static final int ZK_SESSION_TIMEOUT_MS = 30_000; + private static final long LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS = 60000L; + private static final int LIST_PARTITION_REASSIGNMENTS_MAX_ATTEMPTS = 3; + private static final String LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS_CONFIG = "list.partition.reassignment.timeout.ms"; + private static final int DEFAULT_RETRY_BACKOFF_BASE = 2; + + public static String prettyPrint(Object value) throws JsonProcessingException { + ObjectMapper objectMapper = new ObjectMapper(); + ObjectWriter objectWriter = objectMapper.writerWithDefaultPrettyPrinter(); + String written = objectWriter.writeValueAsString(value); + LOG.trace("pretty printed: {}", written); + + return written; + } + + /** + * Retrieve the map of {@link PartitionReassignment reassignment} by {@link TopicPartition partitions}. + * + * If the response times out, the method retries up to {@link #LIST_PARTITION_REASSIGNMENTS_MAX_ATTEMPTS} times. + * The max time to wait for the {@link AdminClient adminClient} response is computed. 
+ *
+ * @param adminClient The {@link AdminClient adminClient} to ask for ongoing partition reassignments
+ * @return The map of {@link PartitionReassignment reassignments} by {@link TopicPartition partition}
+ */
+  public static Map<TopicPartition, PartitionReassignment> ongoingPartitionReassignments(AdminClient adminClient)
+      throws InterruptedException, ExecutionException, TimeoutException {
+    Map<TopicPartition, PartitionReassignment> partitionReassignments = null;
+    int attempts = 0;
+    long timeoutMs = LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS;
+    do {
+      ListPartitionReassignmentsResult responseResult = adminClient.listPartitionReassignments();
+      try {
+        // A successful response is expected to be non-null.
+        partitionReassignments = responseResult.reassignments().get(timeoutMs, TimeUnit.MILLISECONDS);
+      } catch (TimeoutException timeoutException) {
+        LOG.info(
+            "Xinfra Monitor has failed to list partition reassignments in {}ms (attempt={}). "
+                + "Please consider increasing the value of {} config.",
+            timeoutMs, 1 + attempts, LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS_CONFIG);
+        attempts++;
+        if (attempts == LIST_PARTITION_REASSIGNMENTS_MAX_ATTEMPTS) {
+          throw timeoutException;
+        }
+        timeoutMs *= DEFAULT_RETRY_BACKOFF_BASE;
+      }
+    } while (partitionReassignments == null);
+
+    return partitionReassignments;
+  }
+
+  public static List<Integer> replicaIdentifiers(Set<BrokerMetadata> brokers) {
+    if (brokers == null || brokers.size() == 0) {
+      throw new IllegalArgumentException("brokers are either null or empty.");
+    }
+
+    List<BrokerMetadata> brokerMetadataList = new ArrayList<>(brokers);
+
+    // Shuffle to get a random order in the replica list
+    Collections.shuffle(brokerMetadataList);
+
+    // Get broker ids for replica list
+    List<Integer> replicaList = brokerMetadataList.stream().map(m -> m.id()).collect(Collectors.toList());
+
+    return replicaList;
+  }
+
+  /**
+   * Read the number of partitions for the given topic via the given AdminClient.
+   * @param adminClient AdminClient object initialized.
+   * @param topic topic name.
+   * @return the number of partitions of the given topic
+   * @throws ExecutionException thrown when the describeTopics(topics) get(topic) execution fails.
+   * @throws InterruptedException thrown when the adminClient's describeTopics call is interrupted.
+   */
+  private static int getPartitionNumForTopic(AdminClient adminClient, String topic)
+      throws ExecutionException, InterruptedException {
+    try {
+      return adminClient.describeTopics(Collections.singleton(topic)).values().get(topic).get().partitions().size();
+    } catch (NoSuchElementException e) {
+      return 0;
+    } finally {
+      LOG.info("Finished getPartitionNumForTopic.");
+    }
+  }
+
+  /**
+   * Create the topic. This method attempts to create a topic so that all
+   * the brokers in the cluster will have partitionToBrokerRatio partitions. If the topic exists, but has different parameters,
+   * then this does nothing to update the parameters.
+   *
+   * TODO: Do we care about rack aware mode? I would think no because we want to spread the topic over all brokers.
+   * @param topic topic name
+   * @param replicationFactor the replication factor for the topic
+   * @param partitionToBrokerRatio This is multiplied by the number of brokers to compute the number of partitions in the topic.
+   * @param minPartitionNum the minimum number of partitions to create
+   * @param topicConfig additional parameters for the topic, for example min.insync.replicas
+   * @param adminClient AdminClient object initialized.
+   * @return the number of partitions created
+   * @throws ExecutionException exception thrown when executing the topic creation fails.
+ * @throws InterruptedException exception that's thrown when interrupt occurs. + */ + @SuppressWarnings("unchecked") + public static int createTopicIfNotExists(String topic, short replicationFactor, double partitionToBrokerRatio, + int minPartitionNum, Properties topicConfig, AdminClient adminClient) + throws ExecutionException, InterruptedException { + try { + if (adminClient.listTopics().names().get().contains(topic)) { + LOG.info("AdminClient indicates that topic {} already exists in the cluster. Topic config: {}", topic, topicConfig); + return getPartitionNumForTopic(adminClient, topic); + } + int brokerCount = Utils.getBrokerCount(adminClient); + int partitionCount = Math.max((int) Math.ceil(brokerCount * partitionToBrokerRatio), minPartitionNum); + try { + NewTopic newTopic = new NewTopic(topic, partitionCount, replicationFactor); + //noinspection rawtypes + newTopic.configs((Map) topicConfig); + + List topics = new ArrayList<>(); + topics.add(newTopic); + CreateTopicsResult result = adminClient.createTopics(topics); + + // waits for this topic creation future to complete, and then returns its result. + result.values().get(topic).get(); + LOG.info("CreateTopicsResult: {}.", result.values()); + } catch (TopicExistsException e) { + /* There is a race condition with the consumer. */ + LOG.info("Monitoring topic " + topic + " already exists in the cluster.", e); + return getPartitionNumForTopic(adminClient, topic); + } + LOG.info("Created monitoring topic {} in cluster with {} partitions and replication factor of {}.", topic, + partitionCount, replicationFactor); + + return partitionCount; + } finally { + LOG.info("Completed the topic creation if it doesn't exist for {}.", topic); + } + } + + /** + * @return the number of brokers in this cluster + */ + private static int getBrokerCount(AdminClient adminClient) throws ExecutionException, InterruptedException { + return adminClient.describeCluster().nodes().get().size(); + } + + /** + * @param timestamp time in Ms when this message is generated + * @param topic topic this message is sent to + * @param idx index is consecutive numbers used by XinfraMonitor to determine duplicate or lost messages + * @param msgSize size of the message + * @return string that encodes the above fields + */ + public static String jsonFromFields(String topic, long idx, long timestamp, String producerId, int msgSize) { + GenericRecord record = new GenericData.Record(DefaultTopicSchema.MESSAGE_V0); + record.put(DefaultTopicSchema.TOPIC_FIELD.name(), topic); + record.put(DefaultTopicSchema.INDEX_FIELD.name(), idx); + record.put(DefaultTopicSchema.TIME_FIELD.name(), timestamp); + record.put(DefaultTopicSchema.PRODUCER_ID_FIELD.name(), producerId); + // CONTENT_FIELD is composed of #msgSize number of character 'x', e.g. xxxxxxxxxx + record.put(DefaultTopicSchema.CONTENT_FIELD.name(), String.format("%1$-" + msgSize + "s", "").replace(' ', 'x')); + return jsonFromGenericRecord(record); + } + + /** + * @param message kafka message in the string format + * @return GenericRecord that is de-serialized from kafka message w.r.t. expected schema. 
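(Illustrative sketch, not part of the patch: a round trip through the payload helpers above; the topic, producer id, index, and message size are illustrative values.)

    import com.linkedin.xinfra.monitor.common.Utils;
    import org.apache.avro.generic.GenericRecord;

    final class PayloadSketch {
      public static void main(String[] args) {
        // Encode topic/index/timestamp/producerId plus 100 characters of 'x' padding...
        String payload = Utils.jsonFromFields("xinfra-monitor-topic", 42L, System.currentTimeMillis(), "producer-0", 100);
        // ...and decode it back into a GenericRecord with the expected schema.
        GenericRecord parsed = Utils.genericRecordFromJson(payload);
        System.out.println(parsed.get("index") + " produced at " + parsed.get("time"));
      }
    }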
+   */
+  public static GenericRecord genericRecordFromJson(String message) {
+    try {
+      Decoder jsonDecoder = AvroCompatibilityHelper.newCompatibleJsonDecoder(DefaultTopicSchema.MESSAGE_V0, message);
+      GenericDatumReader<GenericRecord> reader =
+          new GenericDatumReader<>(DefaultTopicSchema.MESSAGE_V0, DefaultTopicSchema.MESSAGE_V0);
+      return reader.read(null, jsonDecoder);
+    } catch (Exception e) {
+      throw new IllegalStateException("Unable to deserialize " + message, e);
+    }
+  }
+
+  public static String jsonFromGenericRecord(GenericRecord record) {
+    try {
+      return AvroCodecUtil.serializeJson(record, AvroVersion.AVRO_1_4);
+    } catch (IOException e) {
+      throw new IllegalStateException("Unable to serialize avro record due to error: " + record, e);
+    }
+  }
+
+  public static List<MbeanAttributeValue> getMBeanAttributeValues(String mbeanExpr, String attributeExpr) {
+    List<MbeanAttributeValue> values = new ArrayList<>();
+    MBeanServer server = ManagementFactory.getPlatformMBeanServer();
+    try {
+      Set<ObjectName> mbeanNames = server.queryNames(new ObjectName(mbeanExpr), null);
+      for (ObjectName mbeanName : mbeanNames) {
+        MBeanInfo mBeanInfo = server.getMBeanInfo(mbeanName);
+        MBeanAttributeInfo[] attributeInfos = mBeanInfo.getAttributes();
+        for (MBeanAttributeInfo attributeInfo : attributeInfos) {
+          if (attributeInfo.getName().equals(attributeExpr) || attributeExpr.length() == 0 || attributeExpr.equals(
+              "*")) {
+            double value = (Double) server.getAttribute(mbeanName, attributeInfo.getName());
+            values.add(new MbeanAttributeValue(mbeanName.getCanonicalName(), attributeInfo.getName(), value));
+          }
+        }
+      }
+    } catch (Exception e) {
+      LOG.error("Failed to retrieve value for " + mbeanExpr + ":" + attributeExpr, e);
+    }
+    return values;
+  }
+
+  public static void delay(Duration duration) {
+    try {
+      Thread.sleep(duration.toMillis());
+    } catch (InterruptedException e) {
+      LOG.warn("Interrupted while trying to sleep for {} ms.", duration.toMillis(), e);
+    }
+  }
+}
diff --git a/src/main/java/com/linkedin/kmf/consumer/BaseConsumerRecord.java b/src/main/java/com/linkedin/xinfra/monitor/consumer/BaseConsumerRecord.java
similarity index 91%
rename from src/main/java/com/linkedin/kmf/consumer/BaseConsumerRecord.java
rename to src/main/java/com/linkedin/xinfra/monitor/consumer/BaseConsumerRecord.java
index 17ce0c75..7b8f514b 100644
--- a/src/main/java/com/linkedin/kmf/consumer/BaseConsumerRecord.java
+++ b/src/main/java/com/linkedin/xinfra/monitor/consumer/BaseConsumerRecord.java
@@ -1,5 +1,5 @@
 /**
- * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+ * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
  * file except in compliance with the License. You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
@@ -7,7 +7,8 @@
  * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ -package com.linkedin.kmf.consumer; + +package com.linkedin.xinfra.monitor.consumer; public class BaseConsumerRecord { diff --git a/src/main/java/com/linkedin/kmf/consumer/KMBaseConsumer.java b/src/main/java/com/linkedin/xinfra/monitor/consumer/KMBaseConsumer.java similarity index 53% rename from src/main/java/com/linkedin/kmf/consumer/KMBaseConsumer.java rename to src/main/java/com/linkedin/xinfra/monitor/consumer/KMBaseConsumer.java index 84a011eb..a67a65dc 100644 --- a/src/main/java/com/linkedin/kmf/consumer/KMBaseConsumer.java +++ b/src/main/java/com/linkedin/xinfra/monitor/consumer/KMBaseConsumer.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,18 +7,37 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.consumer; + +package com.linkedin.xinfra.monitor.consumer; + +import java.util.Map; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetCommitCallback; +import org.apache.kafka.common.TopicPartition; + /** * A base consumer used to abstract different consumer classes. * - * Implementations of this class must have constructor with the following signature:
+ * Implementations of this class must have constructor with the following signature: * Constructor({@link java.util.Properties} properties). */ public interface KMBaseConsumer { BaseConsumerRecord receive() throws Exception; + void commitAsync(); + + void commitAsync(final Map offsets, OffsetCommitCallback callback); + + void commitAsync(OffsetCommitCallback callback); + + OffsetAndMetadata committed(TopicPartition tp); + void close(); -} \ No newline at end of file + long lastCommitted(); + + void updateLastCommit(); + +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/consumer/NewConsumer.java b/src/main/java/com/linkedin/xinfra/monitor/consumer/NewConsumer.java new file mode 100644 index 00000000..e958d43c --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/consumer/NewConsumer.java @@ -0,0 +1,105 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.consumer; + +import com.linkedin.xinfra.monitor.common.ConsumerGroupCoordinatorUtils; +import java.time.Duration; +import java.util.Collections; +import java.util.Iterator; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.ExecutionException; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetCommitCallback; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Wraps around the new consumer from Apache Kafka and implements the #KMBaseConsumer interface + */ +public class NewConsumer implements KMBaseConsumer { + + private final KafkaConsumer _consumer; + private Iterator> _recordIter; + private static final Logger LOGGER = LoggerFactory.getLogger(NewConsumer.class); + private static long lastCommitted; + + public NewConsumer(String topic, Properties consumerProperties, AdminClient adminClient) + throws ExecutionException, InterruptedException { + LOGGER.info("{} is being instantiated in the constructor..", this.getClass().getSimpleName()); + + NewConsumerConfig newConsumerConfig = new NewConsumerConfig(consumerProperties); + String targetConsumerGroupId = newConsumerConfig.getString(NewConsumerConfig.TARGET_CONSUMER_GROUP_ID_CONFIG); + + if (targetConsumerGroupId != null) { + consumerProperties.put(ConsumerConfig.GROUP_ID_CONFIG, configureGroupId(targetConsumerGroupId, adminClient)); + } + _consumer = new KafkaConsumer<>(consumerProperties); + _consumer.subscribe(Collections.singletonList(topic)); + } + + static String configureGroupId(String targetConsumerGroupId, AdminClient adminClient) + throws ExecutionException, InterruptedException { + + return ConsumerGroupCoordinatorUtils.findCollision(targetConsumerGroupId, adminClient); + } + + @Override + public BaseConsumerRecord receive() { + if (_recordIter == null || !_recordIter.hasNext()) { + _recordIter = 
_consumer.poll(Duration.ofMillis(Long.MAX_VALUE)).iterator(); + } + + ConsumerRecord record = _recordIter.next(); + return new BaseConsumerRecord(record.topic(), record.partition(), record.offset(), record.key(), record.value()); + } + + @Override + public void commitAsync() { + _consumer.commitAsync(); + } + + @Override + public void commitAsync(final Map offsets, OffsetCommitCallback callback) { + _consumer.commitAsync(offsets, callback); + } + + @Override + public void commitAsync(OffsetCommitCallback callback) { + _consumer.commitAsync(callback); + } + + @Override + public OffsetAndMetadata committed(TopicPartition tp) { + return _consumer.committed(tp); + } + + @Override + public void close() { + _consumer.close(); + } + + @Override + public long lastCommitted() { + return lastCommitted; + } + + @Override + public void updateLastCommit() { + lastCommitted = System.currentTimeMillis(); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/consumer/NewConsumerConfig.java b/src/main/java/com/linkedin/xinfra/monitor/consumer/NewConsumerConfig.java new file mode 100644 index 00000000..0526c022 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/consumer/NewConsumerConfig.java @@ -0,0 +1,41 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +package com.linkedin.xinfra.monitor.consumer; + +import java.util.Map; +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + + +/** + * Configuration for Xinfra Monitor New Consumer + */ +public class NewConsumerConfig extends AbstractConfig { + + private static final ConfigDef CONFIG_DEF; + + public static final String TARGET_CONSUMER_GROUP_ID_CONFIG = "target.consumer.group.id"; + public static final String TARGET_CONSUMER_GROUP_ID_CONFIG_DOC = + "When defined a consumer group is chosen such that it maps to the same group coordinator as the specified " + + "group coordinator."; + + static { + CONFIG_DEF = new ConfigDef().define(TARGET_CONSUMER_GROUP_ID_CONFIG, + ConfigDef.Type.STRING, + null, + ConfigDef.Importance.MEDIUM, + TARGET_CONSUMER_GROUP_ID_CONFIG_DOC); + } + + public NewConsumerConfig(Map props) { + super(CONFIG_DEF, props); + } +} + diff --git a/src/main/java/com/linkedin/kmf/partitioner/KMPartitioner.java b/src/main/java/com/linkedin/xinfra/monitor/partitioner/KMPartitioner.java similarity index 81% rename from src/main/java/com/linkedin/kmf/partitioner/KMPartitioner.java rename to src/main/java/com/linkedin/xinfra/monitor/partitioner/KMPartitioner.java index 839e3d77..36c0f5f6 100644 --- a/src/main/java/com/linkedin/kmf/partitioner/KMPartitioner.java +++ b/src/main/java/com/linkedin/xinfra/monitor/partitioner/KMPartitioner.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.partitioner; + +package com.linkedin.xinfra.monitor.partitioner; public interface KMPartitioner { diff --git a/src/main/java/com/linkedin/kmf/partitioner/NewKMPartitioner.java b/src/main/java/com/linkedin/xinfra/monitor/partitioner/NewKMPartitioner.java similarity index 75% rename from src/main/java/com/linkedin/kmf/partitioner/NewKMPartitioner.java rename to src/main/java/com/linkedin/xinfra/monitor/partitioner/NewKMPartitioner.java index e73a4119..12f8a59d 100644 --- a/src/main/java/com/linkedin/kmf/partitioner/NewKMPartitioner.java +++ b/src/main/java/com/linkedin/xinfra/monitor/partitioner/NewKMPartitioner.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,18 +7,17 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.partitioner; - -import static org.apache.kafka.common.utils.Utils.murmur2; +package com.linkedin.xinfra.monitor.partitioner; public class NewKMPartitioner implements KMPartitioner { public int partition(String key, int partitionNum) { byte[] keyBytes = key.getBytes(); - return toPositive(murmur2(keyBytes)) % partitionNum; + return toPositive(org.apache.kafka.common.utils.Utils.murmur2(keyBytes)) % partitionNum; } private static int toPositive(int number) { return number & 0x7fffffff; } + } diff --git a/src/main/java/com/linkedin/kmf/producer/BaseProducerRecord.java b/src/main/java/com/linkedin/xinfra/monitor/producer/BaseProducerRecord.java similarity index 90% rename from src/main/java/com/linkedin/kmf/producer/BaseProducerRecord.java rename to src/main/java/com/linkedin/xinfra/monitor/producer/BaseProducerRecord.java index 22548383..41c7f05d 100644 --- a/src/main/java/com/linkedin/kmf/producer/BaseProducerRecord.java +++ b/src/main/java/com/linkedin/xinfra/monitor/producer/BaseProducerRecord.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ -package com.linkedin.kmf.producer; + +package com.linkedin.xinfra.monitor.producer; public class BaseProducerRecord { private final String _topic; diff --git a/src/main/java/com/linkedin/kmf/producer/KMBaseProducer.java b/src/main/java/com/linkedin/xinfra/monitor/producer/KMBaseProducer.java similarity index 87% rename from src/main/java/com/linkedin/kmf/producer/KMBaseProducer.java rename to src/main/java/com/linkedin/xinfra/monitor/producer/KMBaseProducer.java index e2ecade6..26635833 100644 --- a/src/main/java/com/linkedin/kmf/producer/KMBaseProducer.java +++ b/src/main/java/com/linkedin/xinfra/monitor/producer/KMBaseProducer.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.producer; + +package com.linkedin.xinfra.monitor.producer; import org.apache.kafka.clients.producer.RecordMetadata; @@ -23,4 +24,4 @@ public interface KMBaseProducer { void close(); -} \ No newline at end of file +} diff --git a/src/main/java/com/linkedin/kmf/producer/NewProducer.java b/src/main/java/com/linkedin/xinfra/monitor/producer/NewProducer.java similarity index 92% rename from src/main/java/com/linkedin/kmf/producer/NewProducer.java rename to src/main/java/com/linkedin/xinfra/monitor/producer/NewProducer.java index 7379dc7b..f9b9f9de 100644 --- a/src/main/java/com/linkedin/kmf/producer/NewProducer.java +++ b/src/main/java/com/linkedin/xinfra/monitor/producer/NewProducer.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.producer; + +package com.linkedin.xinfra.monitor.producer; import org.apache.kafka.clients.producer.KafkaProducer; import org.apache.kafka.clients.producer.ProducerRecord; diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/AbstractService.java b/src/main/java/com/linkedin/xinfra/monitor/services/AbstractService.java new file mode 100644 index 00000000..45d88317 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/AbstractService.java @@ -0,0 +1,80 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+package com.linkedin.xinfra.monitor.services;
+
+import com.linkedin.xinfra.monitor.common.Utils;
+import java.time.Duration;
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.ExecutionException;
+import org.apache.kafka.clients.admin.AdminClient;
+import org.apache.kafka.clients.admin.DescribeTopicsResult;
+import org.apache.kafka.clients.admin.TopicDescription;
+import org.apache.kafka.common.KafkaFuture;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public abstract class AbstractService implements Service {
+
+  private static final Logger LOG = LoggerFactory.getLogger(AbstractService.class);
+  // The fields below are used for the topic description retry logic, since it can take a while for the admin client
+  // to discover a topic because Kafka's metadata is eventually consistent. The retry logic is particularly helpful
+  // for avoiding exceptions right after a new topic gets created, since it takes even longer for the admin client
+  // to discover the newly created topic.
+  private final int _describeTopicRetries;
+  private final Duration _describeTopicRetryInterval;
+
+  AbstractService(int describeTopicRetries, Duration describeTopicRetryInterval) {
+    if (describeTopicRetries < 1) {
+      throw new IllegalArgumentException("Expected describeTopicRetries to be greater than 0. Got: " + describeTopicRetries);
+    }
+    _describeTopicRetries = describeTopicRetries;
+    _describeTopicRetryInterval = describeTopicRetryInterval;
+  }
+
+  TopicDescription getTopicDescription(AdminClient adminClient, String topic) {
+    int attemptCount = 0;
+    TopicDescription topicDescription = null;
+    Exception exception = null;
+
+    while (attemptCount < _describeTopicRetries) {
+      DescribeTopicsResult describeTopicsResult = adminClient.describeTopics(Collections.singleton(topic));
+      Map<String, KafkaFuture<TopicDescription>> topicResultValues = describeTopicsResult.values();
+      KafkaFuture<TopicDescription> topicDescriptionKafkaFuture = topicResultValues.get(topic);
+      topicDescription = null;
+      exception = null;
+      try {
+        topicDescription = topicDescriptionKafkaFuture.get();
+      } catch (InterruptedException | ExecutionException e) {
+        exception = e;
+      }
+      if (exception != null) {
+        LOG.error("Exception occurred while getting the topicDescriptionKafkaFuture for topic: {} at attempt {}", topic,
+            attemptCount, exception);
+      } else if (topicDescription == null) {
+        LOG.warn("Got null description for topic {} at attempt {}", topic, attemptCount);
+      } else {
+        return topicDescription;
+      }
+      attemptCount++;
+      if (attemptCount < _describeTopicRetries) {
+        Utils.delay(_describeTopicRetryInterval);
+      }
+    }
+
+    if (exception != null) {
+      throw new IllegalStateException(exception);
+    } else {
+      throw new IllegalStateException(String.format("Got null description for topic %s after %d retry(s)", topic, _describeTopicRetries));
+    }
+  }
+}
diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationService.java b/src/main/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationService.java
new file mode 100644
index 00000000..56c3ddc2
--- /dev/null
+++ b/src/main/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationService.java
@@ -0,0 +1,391 @@
+/**
+ * Copyright 2020 LinkedIn Corp.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+package com.linkedin.xinfra.monitor.services;
+
+import com.linkedin.xinfra.monitor.XinfraMonitorConstants;
+import com.linkedin.xinfra.monitor.common.Utils;
+import com.linkedin.xinfra.monitor.services.configs.TopicManagementServiceConfig;
+import com.linkedin.xinfra.monitor.services.metrics.ClusterTopicManipulationMetrics;
+import com.linkedin.xinfra.monitor.topicfactory.TopicFactory;
+import java.lang.reflect.InvocationTargetException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import kafka.admin.BrokerMetadata;
+import org.apache.kafka.clients.admin.AdminClient;
+import org.apache.kafka.clients.admin.CreateTopicsResult;
+import org.apache.kafka.clients.admin.NewTopic;
+import org.apache.kafka.clients.admin.TopicDescription;
+import org.apache.kafka.common.KafkaFuture;
+import org.apache.kafka.common.Node;
+import org.apache.kafka.common.TopicPartition;
+import org.apache.kafka.common.metrics.JmxReporter;
+import org.apache.kafka.common.metrics.MetricConfig;
+import org.apache.kafka.common.metrics.Metrics;
+import org.apache.kafka.common.metrics.MetricsReporter;
+import org.apache.kafka.common.requests.DescribeLogDirsResponse;
+import org.apache.kafka.common.utils.SystemTime;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Service that monitors topic creation and deletion in a Kafka cluster.
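+ * <p>Each cycle creates a uniquely named topic, waits until the log directories of every broker
+ * report the expected number of partition replicas, then deletes the topic again, recording
+ * topic-creation and topic-deletion latencies via {@link ClusterTopicManipulationMetrics}.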
+ */ +public class ClusterTopicManipulationService implements Service { + + private static final Logger LOGGER = LoggerFactory.getLogger(ClusterTopicManipulationService.class); + private final String _configDefinedServiceName; + private final Duration _reportIntervalSecond; + private final ScheduledExecutorService _executor; + private final AdminClient _adminClient; + private boolean _isOngoingTopicCreationDone; + private boolean _isOngoingTopicDeletionDone; + private final AtomicBoolean _running; + private String _currentlyOngoingTopic; + int _expectedPartitionsCount; + + private final ClusterTopicManipulationMetrics _clusterTopicManipulationMetrics; + private final TopicFactory _topicFactory; + private final String _zkConnect; + + public ClusterTopicManipulationService(String name, AdminClient adminClient, Map props) + throws ClassNotFoundException, NoSuchMethodException, IllegalAccessException, InvocationTargetException, + InstantiationException { + LOGGER.info("ClusterTopicManipulationService constructor initiated {}", this.getClass().getName()); + + _isOngoingTopicCreationDone = true; + _isOngoingTopicDeletionDone = true; + _adminClient = adminClient; + _executor = Executors.newSingleThreadScheduledExecutor(); + _reportIntervalSecond = Duration.ofSeconds(1); + _running = new AtomicBoolean(false); + _configDefinedServiceName = name; + + MetricConfig metricConfig = new MetricConfig().samples(60).timeWindow(1000, TimeUnit.MILLISECONDS); + List reporters = new ArrayList<>(); + reporters.add(new JmxReporter(Service.JMX_PREFIX)); + Metrics metrics = new Metrics(metricConfig, reporters, new SystemTime()); + Map tags = new HashMap<>(); + tags.put("name", name); + TopicManagementServiceConfig config = new TopicManagementServiceConfig(props); + String topicFactoryClassName = config.getString(TopicManagementServiceConfig.TOPIC_FACTORY_CLASS_CONFIG); + @SuppressWarnings("rawtypes") + Map topicFactoryConfig = + props.containsKey(TopicManagementServiceConfig.TOPIC_FACTORY_PROPS_CONFIG) ? (Map) props.get( + TopicManagementServiceConfig.TOPIC_FACTORY_PROPS_CONFIG) : new HashMap(); + + _clusterTopicManipulationMetrics = new ClusterTopicManipulationMetrics(metrics, tags); + _zkConnect = config.getString(TopicManagementServiceConfig.ZOOKEEPER_CONNECT_CONFIG); + _topicFactory = + (TopicFactory) Class.forName(topicFactoryClassName).getConstructor(Map.class).newInstance(topicFactoryConfig); + } + + /** + * The start logic must only execute once. If an error occurs then the implementer of this class must assume that + * stop() will be called to clean up. This method must be thread safe and must assume that stop() may be called + * concurrently. This can happen if the monitoring application's life cycle is being managed by a container. Start + * will only be called once. + */ + @Override + public void start() { + if (_running.compareAndSet(false, true)) { + LOGGER.info("ClusterTopicManipulationService started for {} - {}", _configDefinedServiceName, + this.getClass().getCanonicalName()); + Runnable clusterTopicManipulationServiceRunnable = new ClusterTopicManipulationServiceRunnable(); + + _executor.scheduleAtFixedRate(clusterTopicManipulationServiceRunnable, _reportIntervalSecond.getSeconds(), + _reportIntervalSecond.getSeconds(), TimeUnit.SECONDS); + } + } + + private class ClusterTopicManipulationServiceRunnable implements Runnable { + + private ClusterTopicManipulationServiceRunnable() { + // unaccessed. 
+ } + + /** + * When an object implementing interface Runnable is used + * to create a thread, starting the thread causes the object's + * run method to be called in that separately executing + * thread. + *

+     * The general contract of the method run is that it may
+     * take any action whatsoever.
+     *
+     * @see Thread#run()
+     */
+    @Override
+    public void run() {
+      try {
+        ClusterTopicManipulationService.this.createDeleteClusterTopic();
+      } catch (Exception e) {
+        LOGGER.error("{} {} failed to run createDeleteClusterTopic()", _configDefinedServiceName,
+            ClusterTopicManipulationService.this.getClass().getSimpleName(), e);
+      }
+    }
+  }
+
+  /**
+   * 1 - Iterates through all the brokers in the cluster.
+   * 2 - Checks the individual log directories of each broker.
+   * 3 - Counts how many partitions of the ongoing topic exist and compares the count against the expected value.
+   * The replication factor is currently set to the broker count so that partitions and replicas
+   * are spread across as many brokers in the cluster as possible.
+   */
+  private void createDeleteClusterTopic() {
+
+    if (_isOngoingTopicCreationDone) {
+
+      int random = ThreadLocalRandom.current().nextInt();
+      _currentlyOngoingTopic = XinfraMonitorConstants.TOPIC_MANIPULATION_SERVICE_TOPIC + Math.abs(random);
+
+      try {
+        int brokerCount = _adminClient.describeCluster().nodes().get().size();
+
+        Set<BrokerMetadata> brokers = new HashSet<>();
+        for (Node broker : _adminClient.describeCluster().nodes().get()) {
+          BrokerMetadata brokerMetadata = new BrokerMetadata(broker.id(), null);
+          brokers.add(brokerMetadata);
+        }
+        Set<Integer> excludedBrokers = _topicFactory.getExcludedBrokers(_adminClient);
+        if (!excludedBrokers.isEmpty()) {
+          brokers.removeIf(broker -> excludedBrokers.contains(broker.id()));
+        }
+
+        // Map from partition id to replica ids (i.e. broker ids).
+        // It is a good idea for all partitions to have the same number of replicas.
+        Map<Integer, List<Integer>> replicasAssignments = new HashMap<>();
+        for (int partition = 0; partition < XinfraMonitorConstants.TOPIC_MANIPULATION_TOPIC_NUM_PARTITIONS;
+            partition++) {
+
+          // Regardless of the replica assignments here, maybeReassignPartitionAndElectLeader()
+          // will periodically reassign the partition as needed.
+          replicasAssignments.putIfAbsent(partition, Utils.replicaIdentifiers(brokers));
+        }
+
+        CreateTopicsResult createTopicsResult =
+            _adminClient.createTopics(Collections.singleton(new NewTopic(_currentlyOngoingTopic, replicasAssignments)));
+        createTopicsResult.all().get();
+        _expectedPartitionsCount = brokerCount * XinfraMonitorConstants.TOPIC_MANIPULATION_TOPIC_NUM_PARTITIONS;
+        _isOngoingTopicCreationDone = false;
+        LOGGER.debug("Initiated a new topic creation. 
topic information - topic: {}, cluster broker count: {}", + _currentlyOngoingTopic, brokerCount); + _clusterTopicManipulationMetrics.startTopicCreationMeasurement(); + } catch (InterruptedException | ExecutionException e) { + LOGGER.error("Exception occurred while retrieving the brokers count: ", e); + } + } + + try { + LOGGER.trace("cluster id: {}", _adminClient.describeCluster().clusterId().get()); + Collection brokers = _adminClient.describeCluster().nodes().get(); + + if (this.doesClusterContainTopic(_currentlyOngoingTopic, brokers, _adminClient, _expectedPartitionsCount)) { + _clusterTopicManipulationMetrics.finishTopicCreationMeasurement(); + _isOngoingTopicCreationDone = true; + + if (_isOngoingTopicDeletionDone) { + KafkaFuture deleteTopicFuture = + _adminClient.deleteTopics(Collections.singleton(_currentlyOngoingTopic)).all(); + + _isOngoingTopicDeletionDone = false; + _clusterTopicManipulationMetrics.startTopicDeletionMeasurement(); + LOGGER.debug("clusterTopicManipulationServiceRunnable: Initiated topic deletion on {}.", + _currentlyOngoingTopic); + + deleteTopicFuture.get(); + } + + LOGGER.trace("{}-clusterTopicManipulationServiceRunnable successful!", this.getClass().getSimpleName()); + } + } catch (InterruptedException | ExecutionException e) { + LOGGER.error("Exception occurred while creating cluster topic in {}: ", _configDefinedServiceName, e); + } + + if (!_isOngoingTopicDeletionDone) { + + _clusterTopicManipulationMetrics.finishTopicDeletionMeasurement(); + LOGGER.debug("Finished measuring deleting the topic."); + + _isOngoingTopicDeletionDone = true; + } + } + + /** + * for all brokers, checks if the topic exists in the cluster by iterating through the log dirs of individual brokers. + * @param topic current ongoing topic + * @param brokers brokers to check log dirs from + * @param adminClient Admin Client + * @return true if the cluster contains the topic. + * @throws ExecutionException when attempting to retrieve the result of a task + * that aborted by throwing an exception. + * @throws InterruptedException when a thread is waiting, sleeping, or occupied, + * and the thread is interrupted, either before or during the activity. 
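+   * <p>A worked example of the expected total (broker count assumed for illustration): with 3 brokers
+   * and a replication factor equal to the broker count, a topic created with n partitions should
+   * surface n * 3 partition replicas across the log directories of the whole cluster.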
+ */ + private boolean doesClusterContainTopic(String topic, Collection brokers, AdminClient adminClient, + int expectedTotalPartitionsInCluster) throws ExecutionException, InterruptedException { + int totalPartitionsInCluster = 0; + for (Node broker : brokers) { + LOGGER.trace("broker log directories: {}", + adminClient.describeLogDirs(Collections.singleton(broker.id())).all().get()); + Map> logDirectoriesResponseMap = + adminClient.describeLogDirs(Collections.singleton(broker.id())).all().get(); + + totalPartitionsInCluster += this.processBroker(logDirectoriesResponseMap, broker, topic); + } + + if (totalPartitionsInCluster != expectedTotalPartitionsInCluster) { + LOGGER.debug("totalPartitionsInCluster {} does not equal expectedTotalPartitionsInCluster {}", + totalPartitionsInCluster, expectedTotalPartitionsInCluster); + return false; + } + + boolean isDescribeSuccessful = true; + try { + Map topicDescriptions = + ClusterTopicManipulationService.describeTopics(adminClient, Collections.singleton(topic)); + LOGGER.trace("topicDescriptionMap = {}", topicDescriptions); + } catch (InterruptedException | ExecutionException e) { + isDescribeSuccessful = false; + LOGGER.error("Exception occurred within describeTopicsFinished method for topics {}", + Collections.singleton(topic), e); + } + + LOGGER.trace("isDescribeSuccessful: {}", isDescribeSuccessful); + return isDescribeSuccessful; + } + + /** + * Waits if necessary for this future to complete and gets the future in a blocking fashion. + * returns Map if the future succeeds, which occurs only if all the topic descriptions are successful. + * @param adminClient administrative client for Kafka, supporting managing and inspecting topics, brokers, configurations and ACLs. + * @param topicNames Collection of topic names + * @return Map if describe topic succeeds. + */ + private static Map describeTopics(AdminClient adminClient, Collection topicNames) + throws InterruptedException, ExecutionException { + KafkaFuture> mapKafkaFuture = adminClient.describeTopics(topicNames).all(); + LOGGER.debug("describeTopics future: {}", mapKafkaFuture); + LOGGER.debug("describeTopics: {}", mapKafkaFuture.get()); + + return mapKafkaFuture.get(); + } + + /** + * iterates through the broker's log directories and checks for the ongoing topic partitions and replica's existence. 
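+   * <p>Note: only the first log directory reported by each broker is inspected when counting
+   * partition replicas.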
+ * @param logDirectoriesResponseMap map of log directories response in the broker + * @param broker broker to process the log dirs in + * @param topic ongoing kmf manipulation topic + */ + int processBroker(Map> logDirectoriesResponseMap, + Node broker, String topic) { + int totalPartitionsInBroker = 0; + LOGGER.trace("logDirectoriesResponseMap: {}", logDirectoriesResponseMap); + Map logDirInfoMap = logDirectoriesResponseMap.get(broker.id()); + String logDirectoriesKey = logDirInfoMap.keySet().iterator().next(); + LOGGER.trace("logDirInfoMap: {}", logDirInfoMap.get(logDirectoriesKey)); + DescribeLogDirsResponse.LogDirInfo logDirInfo = logDirInfoMap.get(logDirectoriesKey); + + if (logDirInfo != null && !logDirectoriesResponseMap.isEmpty()) { + Map topicPartitionReplicaInfoMap = logDirInfo.replicaInfos; + totalPartitionsInBroker += this.processLogDirsWithinBroker(topicPartitionReplicaInfoMap, topic, broker); + } + + return totalPartitionsInBroker; + } + + private int processLogDirsWithinBroker( + Map topicPartitionReplicaInfoMap, String topic, + Node broker) { + int totalPartitionsInBroker = 0; + for (Map.Entry topicPartitionReplicaInfoEntry : topicPartitionReplicaInfoMap + .entrySet()) { + + TopicPartition topicPartition = topicPartitionReplicaInfoEntry.getKey(); + DescribeLogDirsResponse.ReplicaInfo replicaInfo = topicPartitionReplicaInfoEntry.getValue(); + + if (topicPartition.topic().equals(topic)) { + totalPartitionsInBroker++; + LOGGER.trace("totalPartitions In The Broker = {}", totalPartitionsInBroker); + } + + LOGGER.trace("broker information: {}", broker); + LOGGER.trace("logDirInfo for kafka-logs: topicPartition = {}, replicaInfo = {}", topicPartition, replicaInfo); + } + + return totalPartitionsInBroker; + } + + /** + * This may be called multiple times. This method must be thread safe and must assume that start() may be called + * concurrently. This can happen if the monitoring application's life cycle is being managed by a container. + * Implementations must be non-blocking and should release the resources acquired by the service during start(). + */ + @Override + public void stop() { + if (_running.compareAndSet(true, false)) { + _executor.shutdown(); + } + } + + /** + * Implementations of this method must be thread safe as it can be called at any time. Implementations must be + * non-blocking. + * @return true if this start() has returned successfully else this must return false. This must also return false if + * the service can no longer perform its function. + */ + @Override + public boolean isRunning() { + + return _running.get() && !_executor.isShutdown(); + } + + /** + * Implementations of this method must be thread safe and must be blocking. 
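+   * <p>Note: this implementation waits up to a fixed three minutes for executor termination and does
+   * not use the supplied timeout and timeUnit arguments.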
+ */ + @Override + public void awaitShutdown(long timeout, TimeUnit timeUnit) { + + try { + _executor.awaitTermination(3, TimeUnit.MINUTES); + LOGGER.info("{} shutdown completed", _configDefinedServiceName); + } catch (InterruptedException e) { + LOGGER.info("Thread interrupted when waiting for {} to shutdown", _configDefinedServiceName); + } + } + + @Override + public String toString() { + return this.getClass().getSimpleName() + "-" + _configDefinedServiceName; + } + + void setExpectedPartitionsCount(int count) { + _expectedPartitionsCount = count; + } + + int expectedPartitionsCount() { + return _expectedPartitionsCount; + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationServiceFactory.java new file mode 100644 index 00000000..2d932174 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationServiceFactory.java @@ -0,0 +1,50 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; +import org.apache.kafka.clients.admin.AdminClient; + + +/** + * Factory class which instantiates a ClusterTopicManipulationService service object. + */ +@SuppressWarnings("rawtypes") +public class ClusterTopicManipulationServiceFactory implements ServiceFactory { + + private final Map _properties; + private final String _serviceName; + + /** + * "Class 'ClusterTopicManipulationServiceFactory' is never used" and + * "Constructor 'ClusterTopicManipulationServiceFactory(java.util.Map, java.lang.String)' is never used" + * shown as warnings in Intellij IDEA are not true. + * XinfraMonitor class uses (ServiceFactory) Class.forName(..) + * .getConstructor(...).newInstance(...) to return Class that's associated + * with the class or interface with the given string name + * @param properties config properties + * @param serviceName name of the service + */ + public ClusterTopicManipulationServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + + AdminClient adminClient = AdminClient.create(_properties); + + return new ClusterTopicManipulationService(_serviceName, adminClient, _properties); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ConsumeService.java b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumeService.java new file mode 100644 index 00000000..53f32360 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumeService.java @@ -0,0 +1,271 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.common.DefaultTopicSchema; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.consumer.BaseConsumerRecord; +import com.linkedin.xinfra.monitor.consumer.KMBaseConsumer; +import com.linkedin.xinfra.monitor.services.metrics.CommitAvailabilityMetrics; +import com.linkedin.xinfra.monitor.services.metrics.CommitLatencyMetrics; +import com.linkedin.xinfra.monitor.services.metrics.ConsumeMetrics; +import java.time.Duration; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetCommitCallback; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.metrics.JmxReporter; +import org.apache.kafka.common.metrics.MetricConfig; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.MetricsReporter; +import org.apache.kafka.common.metrics.stats.CumulativeSum; +import org.apache.kafka.common.utils.SystemTime; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ConsumeService extends AbstractService { + private static final Logger LOG = LoggerFactory.getLogger(ConsumeService.class); + private static final String TAGS_NAME = "name"; + private static final long COMMIT_TIME_INTERVAL = 4; + private static final long CONSUME_THREAD_SLEEP_MS = 100; + private static Metrics metrics; + private final AtomicBoolean _running; + private final KMBaseConsumer _baseConsumer; + private final int _latencySlaMs; + private ConsumeMetrics _sensors; + private Thread _consumeThread; + private final AdminClient _adminClient; + private CommitAvailabilityMetrics _commitAvailabilityMetrics; + private CommitLatencyMetrics _commitLatencyMetrics; + private String _topic; + private final String _name; + private static final String METRIC_GROUP_NAME = "consume-service"; + private static Map tags; + + /** + * Mainly contains services for three metrics: + * 1 - ConsumeAvailability metrics + * 2 - CommitOffsetAvailability metrics + * 2.1 - commitAvailabilityMetrics records offsets committed upon success. that is, no exception upon callback + * 2.2 - commitAvailabilityMetrics records offsets commit fail upon failure. that is, exception upon callback + * 3 - CommitOffsetLatency metrics + * 3.1 - commitLatencyMetrics records the latency between last successful callback and start of last recorded commit. + * + * @param name Name of the Monitor instance + * @param topicPartitionResult The completable future for topic partition + * @param consumerFactory Consumer Factory object. 
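+   * <p>Offsets are committed asynchronously at most once every COMMIT_TIME_INTERVAL (4) seconds;
+   * commit latency is measured from the start of a commit to the completion of its callback.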
+   * @throws ExecutionException when attempting to retrieve the result of a task that aborted by throwing an exception
+   * @throws InterruptedException when a thread is waiting, sleeping, or otherwise occupied and the thread is interrupted
+   */
+  public ConsumeService(String name,
+                        CompletableFuture<Void> topicPartitionResult,
+                        ConsumerFactory consumerFactory)
+      throws ExecutionException, InterruptedException {
+    // TODO: Make the values of the fields below come from configs.
+    super(10, Duration.ofMinutes(1));
+    _baseConsumer = consumerFactory.baseConsumer();
+    _latencySlaMs = consumerFactory.latencySlaMs();
+    _name = name;
+    _adminClient = consumerFactory.adminClient();
+    _running = new AtomicBoolean(false);
+
+    // Returns a new CompletionStage (topicPartitionFuture) which executes the given action -
+    // the code inside run() - when this stage (topicPartitionResult) completes normally.
+    CompletableFuture<Void> topicPartitionFuture = topicPartitionResult.thenRun(() -> {
+      MetricConfig metricConfig = new MetricConfig().samples(60).timeWindow(1000, TimeUnit.MILLISECONDS);
+      List<MetricsReporter> reporters = new ArrayList<>();
+      reporters.add(new JmxReporter(JMX_PREFIX));
+      metrics = new Metrics(metricConfig, reporters, new SystemTime());
+      tags = new HashMap<>();
+      tags.put(TAGS_NAME, name);
+      _topic = consumerFactory.topic();
+      _sensors = new ConsumeMetrics(metrics, tags, consumerFactory.latencyPercentileMaxMs(),
+          consumerFactory.latencyPercentileGranularityMs());
+      _commitLatencyMetrics = new CommitLatencyMetrics(metrics, tags, consumerFactory.latencyPercentileMaxMs(),
+          consumerFactory.latencyPercentileGranularityMs());
+      _commitAvailabilityMetrics = new CommitAvailabilityMetrics(metrics, tags);
+      _consumeThread = new Thread(() -> {
+        try {
+          consume();
+        } catch (Exception e) {
+          LOG.error(name + "/ConsumeService failed", e);
+        }
+      }, name + " consume-service");
+      _consumeThread.setDaemon(true);
+      _consumeThread.setUncaughtExceptionHandler((t, e) -> {
+        LOG.error(name + "/ConsumeService error", e);
+      });
+    });
+
+    // In a blocking fashion, wait for this topicPartitionFuture to complete, and then return its result.
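+    // Failures inside the callback above (metric registration or consume-thread setup) therefore
+    // surface here as an ExecutionException rather than being deferred until start().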
+ topicPartitionFuture.get(); + } + + private void consume() throws Exception { + /* Delay 1 second to reduce the chance that consumer creates topic before TopicManagementService */ + Thread.sleep(1000); + + Map nextIndexes = new HashMap<>(); + + while (_running.get()) { + BaseConsumerRecord record; + try { + record = _baseConsumer.receive(); + } catch (Exception e) { + _sensors._consumeError.record(); + LOG.warn(_name + "/ConsumeService failed to receive record", e); + /* Avoid busy while loop */ + //noinspection BusyWait + Thread.sleep(CONSUME_THREAD_SLEEP_MS); + continue; + } + + if (record == null) continue; + + GenericRecord avroRecord = null; + try { + avroRecord = Utils.genericRecordFromJson(record.value()); + } catch (Exception exception) { + LOG.error("An exception occurred while getting avro record.", exception); + } + + if (avroRecord == null) { + _sensors._consumeError.record(); + continue; + } + int partition = record.partition(); + /* Commit availability and commit latency service */ + /* Call commitAsync, wait for a NON-NULL return value (see https://issues.apache.org/jira/browse/KAFKA-6183) */ + OffsetCommitCallback commitCallback = new OffsetCommitCallback() { + @Override + public void onComplete(Map topicPartitionOffsetAndMetadataMap, Exception kafkaException) { + if (kafkaException != null) { + LOG.error("Exception while trying to perform an asynchronous commit.", kafkaException); + _commitAvailabilityMetrics._failedCommitOffsets.record(); + } else { + _commitAvailabilityMetrics._offsetsCommitted.record(); + _commitLatencyMetrics.recordCommitComplete(); + } + } + }; + + /* Current timestamp to perform subtraction*/ + long currTimeMillis = System.currentTimeMillis(); + + /* 4 seconds consumer offset commit interval. */ + long timeDiffMillis = TimeUnit.SECONDS.toMillis(COMMIT_TIME_INTERVAL); + + if (currTimeMillis - _baseConsumer.lastCommitted() >= timeDiffMillis) { + /* commit the consumer offset asynchronously with a callback. */ + _baseConsumer.commitAsync(commitCallback); + _commitLatencyMetrics.recordCommitStart(); + /* Record the current time for the committed consumer offset */ + _baseConsumer.updateLastCommit(); + } + /* Finished consumer offset commit service. */ + + long index = (Long) avroRecord.get(DefaultTopicSchema.INDEX_FIELD.name()); + long currMs = System.currentTimeMillis(); + long prevMs = (Long) avroRecord.get(DefaultTopicSchema.TIME_FIELD.name()); + + _sensors._recordsConsumed.record(); + _sensors._bytesConsumed.record(record.value().length()); + _sensors._recordsDelay.record(currMs - prevMs); + + if (currMs - prevMs > _latencySlaMs) + _sensors._recordsDelayed.record(); + + if (index == -1L || !nextIndexes.containsKey(partition)) { + nextIndexes.put(partition, -1L); + continue; + } + + long nextIndex = nextIndexes.get(partition); + + if (nextIndex == -1 || index == nextIndex) { + nextIndexes.put(partition, index + 1); + + } else if (index < nextIndex) { + _sensors._recordsDuplicated.record(); + } else { // this will equate to the case where index > nextIndex... + nextIndexes.put(partition, index + 1); + long numLostRecords = index - nextIndex; + _sensors._recordsLost.record(numLostRecords); + LOG.info("_recordsLost recorded: Avro record current index: {} at timestamp {}. Next index: {}. 
Lost {} records.", index, currMs, nextIndex, numLostRecords); + } + } + /* end of consume() while loop */ + LOG.info("{}/ConsumeService/Consumer closing.", _name); + _baseConsumer.close(); + LOG.info("{}/ConsumeService/Consumer stopped.", _name); + } + + Metrics metrics() { + return metrics; + } + + void startConsumeThreadForTesting() { + if (_running.compareAndSet(false, true)) { + _consumeThread.start(); + LOG.info("{}/ConsumeService started.", _name); + } + } + + @Override + public synchronized void start() { + if (_running.compareAndSet(false, true)) { + _consumeThread.start(); + LOG.info("{}/ConsumeService started.", _name); + + TopicDescription topicDescription = getTopicDescription(_adminClient, _topic); + @SuppressWarnings("ConstantConditions") + double partitionCount = topicDescription.partitions().size(); + metrics.sensor("topic-partitions").add( + new MetricName("topic-partitions-count", METRIC_GROUP_NAME, "The total number of partitions for the topic.", + tags), new CumulativeSum(partitionCount)); + } + } + + @Override + public synchronized void stop() { + if (_running.compareAndSet(true, false)) { + LOG.info("{}/ConsumeService stopping.", _name); + } + } + + @Override + public void awaitShutdown(long timeout, TimeUnit unit) { + LOG.info("{}/ConsumeService shutdown awaiting…", _name); + try { + _consumeThread.join(unit.toMillis(timeout)); + } catch (InterruptedException e) { + LOG.error(_name + "/ConsumeService interrupted", e); + } + LOG.info("{}/ConsumeService shutdown completed.", _name); + } + + @Override + public boolean isRunning() { + return _running.get() && _consumeThread.isAlive(); + } + +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ConsumeServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumeServiceFactory.java new file mode 100644 index 00000000..c3a290ed --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumeServiceFactory.java @@ -0,0 +1,39 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; +import java.util.concurrent.CompletableFuture; + + +/** + * Factory that constructs the ConsumeService. 
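+ * <p>Like the other service factories in this package, this class is expected to be instantiated
+ * reflectively by XinfraMonitor via Class.forName(..).getConstructor(..).newInstance(..).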
+ */ +@SuppressWarnings({"rawtypes", "unchecked"}) +public class ConsumeServiceFactory implements ServiceFactory { + private final Map _props; + private final String _name; + + public ConsumeServiceFactory(Map props, String name) { + _props = props; + _name = name; + } + + @Override + public Service createService() throws Exception { + + CompletableFuture topicPartitionResult = new CompletableFuture<>(); + topicPartitionResult.complete(null); + ConsumerFactoryImpl consumerFactory = new ConsumerFactoryImpl(_props); + + return new ConsumeService(_name, topicPartitionResult, consumerFactory); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ConsumerFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumerFactory.java new file mode 100644 index 00000000..e603550b --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumerFactory.java @@ -0,0 +1,32 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + + +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.consumer.KMBaseConsumer; +import org.apache.kafka.clients.admin.AdminClient; + + +public interface ConsumerFactory { + + AdminClient adminClient(); + + int latencySlaMs(); + + KMBaseConsumer baseConsumer(); + + String topic(); + + int latencyPercentileMaxMs(); + + int latencyPercentileGranularityMs(); + +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ConsumerFactoryImpl.java b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumerFactoryImpl.java new file mode 100644 index 00000000..07943db8 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ConsumerFactoryImpl.java @@ -0,0 +1,138 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.consumer.KMBaseConsumer; +import com.linkedin.xinfra.monitor.consumer.NewConsumer; +import com.linkedin.xinfra.monitor.services.configs.CommonServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.ConsumeServiceConfig; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.Random; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class ConsumerFactoryImpl implements ConsumerFactory { + private final KMBaseConsumer _baseConsumer; + private final String _topic; + private static final String FALSE = "false"; + private final int _latencyPercentileMaxMs; + private final int _latencyPercentileGranularityMs; + private static final String[] NON_OVERRIDABLE_PROPERTIES = + new String[] {ConsumeServiceConfig.BOOTSTRAP_SERVERS_CONFIG, ConsumeServiceConfig.ZOOKEEPER_CONNECT_CONFIG}; + private final int _latencySlaMs; + private static AdminClient adminClient; + private static final Logger LOG = LoggerFactory.getLogger(ConsumerFactoryImpl.class); + + @SuppressWarnings("rawtypes") + public ConsumerFactoryImpl(Map props) throws Exception { + LOG.info("Creating AdminClient."); + adminClient = AdminClient.create(props); + Map consumerPropsOverride = props.containsKey(ConsumeServiceConfig.CONSUMER_PROPS_CONFIG) + ? (Map) props.get(ConsumeServiceConfig.CONSUMER_PROPS_CONFIG) : new HashMap<>(); + ConsumeServiceConfig config = new ConsumeServiceConfig(props); + _topic = config.getString(ConsumeServiceConfig.TOPIC_CONFIG); + String zkConnect = config.getString(ConsumeServiceConfig.ZOOKEEPER_CONNECT_CONFIG); + String brokerList = config.getString(ConsumeServiceConfig.BOOTSTRAP_SERVERS_CONFIG); + String consumerClassName = config.getString(ConsumeServiceConfig.CONSUMER_CLASS_CONFIG); + _latencySlaMs = config.getInt(ConsumeServiceConfig.LATENCY_SLA_MS_CONFIG); + _latencyPercentileMaxMs = config.getInt(ConsumeServiceConfig.LATENCY_PERCENTILE_MAX_MS_CONFIG); + _latencyPercentileGranularityMs = config.getInt(ConsumeServiceConfig.LATENCY_PERCENTILE_GRANULARITY_MS_CONFIG); + for (String property: NON_OVERRIDABLE_PROPERTIES) { + if (consumerPropsOverride.containsKey(property)) { + throw new ConfigException("Override must not contain " + property + " config."); + } + } + Properties consumerProps = new Properties(); + + /* Assign default config. This has the lowest priority. */ + consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, FALSE); + consumerProps.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); + consumerProps.put(ConsumerConfig.CLIENT_ID_CONFIG, "kmf-consumer"); + consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "kmf-consumer-group-" + new Random().nextInt()); + consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + if (consumerClassName.equals(NewConsumer.class.getCanonicalName()) || consumerClassName.equals(NewConsumer.class.getSimpleName())) { + consumerClassName = NewConsumer.class.getCanonicalName(); + } + + /* Assign config specified for ConsumeService. 
*/ + consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); + consumerProps.put(CommonServiceConfig.ZOOKEEPER_CONNECT_CONFIG, zkConnect); + + /* Assign config specified for consumer. This has the highest priority. */ + consumerProps.putAll(consumerPropsOverride); + + if (props.containsKey(ConsumeServiceConfig.CONSUMER_PROPS_CONFIG)) { + props.forEach(consumerProps::putIfAbsent); + } + + java.lang.reflect.Constructor constructor = adminClientConstructorIfExists(consumerClassName); + if (constructor != null) { + _baseConsumer = (KMBaseConsumer) constructor + .newInstance(_topic, consumerProps, adminClient()); + } else { + _baseConsumer = (KMBaseConsumer) Class.forName(consumerClassName) + .getConstructor(String.class, Properties.class) + .newInstance(_topic, consumerProps); + } + } + + private static java.lang.reflect.Constructor adminClientConstructorIfExists(String consumerClassName) + throws ClassNotFoundException { + try { + return Class.forName(consumerClassName).getConstructor(String.class, Properties.class, AdminClient.class); + } catch (java.lang.NoSuchMethodException noSuchMethodException) { + LOG.info(consumerClassName + + " does not provide a constructor with signature (String, Properties, AdminClient) - falling back to (String, Properties)."); + return null; + } catch (ClassNotFoundException e) { + throw new ClassNotFoundException("Consumer class " + consumerClassName + " was not found.", e); + } + } + + @Override + public AdminClient adminClient() { + return adminClient; + } + + @Override + public int latencySlaMs() { + return _latencySlaMs; + } + + @Override + public KMBaseConsumer baseConsumer() { + return _baseConsumer; + } + + @Override + public String topic() { + return _topic; + } + + @Override + public int latencyPercentileMaxMs() { + return _latencyPercentileMaxMs; + } + + @Override + public int latencyPercentileGranularityMs() { + return _latencyPercentileGranularityMs; + } + +} diff --git a/src/main/java/com/linkedin/kmf/services/DefaultMetricsReporterService.java b/src/main/java/com/linkedin/xinfra/monitor/services/DefaultMetricsReporterService.java similarity index 68% rename from src/main/java/com/linkedin/kmf/services/DefaultMetricsReporterService.java rename to src/main/java/com/linkedin/xinfra/monitor/services/DefaultMetricsReporterService.java index b6f8dac3..64a62bf1 100644 --- a/src/main/java/com/linkedin/kmf/services/DefaultMetricsReporterService.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/DefaultMetricsReporterService.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,22 +7,23 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ -package com.linkedin.kmf.services; -import static com.linkedin.kmf.common.Utils.getMBeanAttributeValues; +package com.linkedin.xinfra.monitor.services; -import com.linkedin.kmf.common.MbeanAttributeValue; -import com.linkedin.kmf.services.configs.DefaultMetricsReporterServiceConfig; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import com.linkedin.xinfra.monitor.common.MbeanAttributeValue; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.DefaultMetricsReporterServiceConfig; import java.util.List; import java.util.Map; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DefaultMetricsReporterService implements Service { private static final Logger LOG = LoggerFactory.getLogger(DefaultMetricsReporterService.class); + private static final String LOG_DIVIDER = "=============================================================="; private final String _name; private final List _metricNames; @@ -39,25 +40,20 @@ public DefaultMetricsReporterService(Map props, String name) { @Override public synchronized void start() { - _executor.scheduleAtFixedRate( - new Runnable() { - @Override - public void run() { - try { - reportMetrics(); - } catch (Exception e) { - LOG.error(_name + "/DefaultMetricsReporterService failed to report metrics", e); - } - } - }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS - ); - LOG.info("{}/DefaultMetricsReporterService started", _name); + _executor.scheduleAtFixedRate(() -> { + try { + reportMetrics(); + } catch (Exception e) { + LOG.error(_name + "/DefaultMetricsReporterService failed to report metrics.", e); + } + }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS); + LOG.info("{}/DefaultMetricsReporterService started.", _name); } @Override public synchronized void stop() { _executor.shutdown(); - LOG.info("{}/DefaultMetricsReporterService stopped", _name); + LOG.info("{}/DefaultMetricsReporterService stopped.", _name); } @Override @@ -66,26 +62,28 @@ public boolean isRunning() { } @Override - public void awaitShutdown() { + public void awaitShutdown(long timeout, TimeUnit unit) { try { _executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { - LOG.info("Thread interrupted when waiting for {}/DefaultMetricsReporterService to shutdown", _name); + LOG.info("Thread interrupted when waiting for {}/DefaultMetricsReporterService to shutdown.", _name); } - LOG.info("{}/DefaultMetricsReporterService shutdown completed", _name); + LOG.info("{}/DefaultMetricsReporterService shutdown completed.", _name); } + + private void reportMetrics() { StringBuilder builder = new StringBuilder(); for (String metricName: _metricNames) { String mbeanExpr = metricName.substring(0, metricName.lastIndexOf(":")); String attributeExpr = metricName.substring(metricName.lastIndexOf(":") + 1); - List attributeValues = getMBeanAttributeValues(mbeanExpr, attributeExpr); + List attributeValues = Utils.getMBeanAttributeValues(mbeanExpr, attributeExpr); for (MbeanAttributeValue attributeValue: attributeValues) { builder.append(attributeValue.toString()); builder.append("\n"); } } - LOG.info(builder.toString()); + LOG.info("{}\n{}", LOG_DIVIDER, builder.toString()); } } diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/DefaultMetricsReporterServiceFactory.java 
b/src/main/java/com/linkedin/xinfra/monitor/services/DefaultMetricsReporterServiceFactory.java new file mode 100644 index 00000000..9d0acec0 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/DefaultMetricsReporterServiceFactory.java @@ -0,0 +1,35 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory class which instantiates a DefaultMetricsReporterService. + */ +@SuppressWarnings("rawtypes") +public class DefaultMetricsReporterServiceFactory implements ServiceFactory { + private final Map _properties; + private final String _serviceName; + + public DefaultMetricsReporterServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() { + return new DefaultMetricsReporterService(_properties, _serviceName); + } +} diff --git a/src/main/java/com/linkedin/kmf/services/GraphiteMetricsReporterService.java b/src/main/java/com/linkedin/xinfra/monitor/services/GraphiteMetricsReporterService.java similarity index 81% rename from src/main/java/com/linkedin/kmf/services/GraphiteMetricsReporterService.java rename to src/main/java/com/linkedin/xinfra/monitor/services/GraphiteMetricsReporterService.java index 706defd5..24512d7d 100644 --- a/src/main/java/com/linkedin/kmf/services/GraphiteMetricsReporterService.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/GraphiteMetricsReporterService.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,20 +7,12 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ -package com.linkedin.kmf.services; +package com.linkedin.xinfra.monitor.services; -import static com.linkedin.kmf.common.Utils.getMBeanAttributeValues; - -import com.linkedin.kmf.common.MbeanAttributeValue; -import com.linkedin.kmf.services.configs.GraphiteMetricsReporterServiceConfig; -import net.savantly.graphite.GraphiteClient; -import net.savantly.graphite.GraphiteClientFactory; -import net.savantly.graphite.impl.SimpleCarbonMetric; -import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import com.linkedin.xinfra.monitor.common.MbeanAttributeValue; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.GraphiteMetricsReporterServiceConfig; import java.net.SocketException; import java.net.UnknownHostException; import java.util.List; @@ -28,6 +20,12 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import net.savantly.graphite.GraphiteClient; +import net.savantly.graphite.GraphiteClientFactory; +import net.savantly.graphite.impl.SimpleCarbonMetric; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class GraphiteMetricsReporterService implements Service { private static final Logger LOG = LoggerFactory.getLogger(GraphiteMetricsReporterService.class); @@ -54,17 +52,17 @@ public GraphiteMetricsReporterService(Map props, String name) @Override public synchronized void start() { - _executor.scheduleAtFixedRate( - new Runnable() { - @Override - public void run() { - try { - reportMetrics(); - } catch (Exception e) { - LOG.error(_name + "/GraphiteMetricsReporterService failed to report metrics", e); - } - } - }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS + _executor.scheduleAtFixedRate(new Runnable() { + @Override + public void run() { + try { + GraphiteMetricsReporterService.this.reportMetrics(); + } catch (Exception e) { + LOG.error(_name + "/GraphiteMetricsReporterService failed to report metrics", + e); + } + } + }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS ); LOG.info("{}/GraphiteMetricsReporterService started", _name); } @@ -81,7 +79,7 @@ public boolean isRunning() { } @Override - public void awaitShutdown() { + public void awaitShutdown(long timeout, TimeUnit unit) { try { _executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { @@ -104,7 +102,7 @@ private void reportMetrics() { for (String metricName: _metricNames) { String mbeanExpr = metricName.substring(0, metricName.lastIndexOf(":")); String attributeExpr = metricName.substring(metricName.lastIndexOf(":") + 1); - List attributeValues = getMBeanAttributeValues(mbeanExpr, attributeExpr); + List attributeValues = Utils.getMBeanAttributeValues(mbeanExpr, attributeExpr); for (MbeanAttributeValue attributeValue: attributeValues) { _graphiteClient.saveCarbonMetrics( new SimpleCarbonMetric( diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/GraphiteMetricsReporterServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/GraphiteMetricsReporterServiceFactory.java new file mode 100644 index 00000000..ed1c5886 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/GraphiteMetricsReporterServiceFactory.java @@ -0,0 +1,36 @@ +/** + * Copyright 2020 LinkedIn Corp. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory class which instantiates a GraphiteMetricsReporterService service. + */ +@SuppressWarnings("rawtypes") +public class GraphiteMetricsReporterServiceFactory implements ServiceFactory { + + private final Map _properties; + private final String _serviceName; + + public GraphiteMetricsReporterServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + return new GraphiteMetricsReporterService(_properties, _serviceName); + } +} diff --git a/src/main/java/com/linkedin/kmf/services/JolokiaService.java b/src/main/java/com/linkedin/xinfra/monitor/services/JolokiaService.java similarity index 84% rename from src/main/java/com/linkedin/kmf/services/JolokiaService.java rename to src/main/java/com/linkedin/xinfra/monitor/services/JolokiaService.java index 5218cb32..ae1806e7 100644 --- a/src/main/java/com/linkedin/kmf/services/JolokiaService.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/JolokiaService.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,21 +7,22 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ -package com.linkedin.kmf.services; +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.jolokia.jvmagent.JolokiaServer; import org.jolokia.jvmagent.JvmAgentConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Map; -import java.util.concurrent.atomic.AtomicBoolean; - /** * Jolokia server allows user to query jmx metric value with HTTP request */ public class JolokiaService implements Service { - private static final Logger LOG = LoggerFactory.getLogger(JettyService.class); + private static final Logger LOG = LoggerFactory.getLogger(JolokiaService.class); private final String _name; private final JolokiaServer _jolokiaServer; @@ -51,7 +52,7 @@ public boolean isRunning() { return _isRunning.get(); } - public void awaitShutdown() { + public void awaitShutdown(long timeout, TimeUnit timeUnit) { } diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/JolokiaServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/JolokiaServiceFactory.java new file mode 100644 index 00000000..809eb630 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/JolokiaServiceFactory.java @@ -0,0 +1,36 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory class which instantiates a JolokiaService service. + */ +@SuppressWarnings("rawtypes") +public class JolokiaServiceFactory implements ServiceFactory { + + private final Map _properties; + private final String _serviceName; + + public JolokiaServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + return new JolokiaService(_properties, _serviceName); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/KafkaMetricsReporterService.java b/src/main/java/com/linkedin/xinfra/monitor/services/KafkaMetricsReporterService.java new file mode 100644 index 00000000..4027dc08 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/KafkaMetricsReporterService.java @@ -0,0 +1,134 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.linkedin.xinfra.monitor.common.MbeanAttributeValue; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.KafkaMetricsReporterServiceConfig; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class KafkaMetricsReporterService implements Service { + private static final Logger LOGGER = LoggerFactory.getLogger(KafkaMetricsReporterService.class); + private static final String METRICS_PRODUCER_ID = "kafka-metrics-reporter-id"; + private final String _name; + private final List _metricsNames; + private final int _reportIntervalSec; + private final ScheduledExecutorService _executor; + private KafkaProducer _producer; + private final String _brokerList; + private final String _topic; + private final ObjectMapper _parser = new ObjectMapper(); + + public KafkaMetricsReporterService(Map props, String name, AdminClient adminClient) throws Exception { + _name = name; + KafkaMetricsReporterServiceConfig config = new KafkaMetricsReporterServiceConfig(props); + _metricsNames = config.getList(KafkaMetricsReporterServiceConfig.REPORT_METRICS_CONFIG); + _reportIntervalSec = config.getInt(KafkaMetricsReporterServiceConfig.REPORT_INTERVAL_SEC_CONFIG); + _executor = Executors.newSingleThreadScheduledExecutor(); + _brokerList = config.getString(KafkaMetricsReporterServiceConfig.BOOTSTRAP_SERVERS_CONFIG); + initializeProducer(); + _topic = config.getString(KafkaMetricsReporterServiceConfig.TOPIC_CONFIG); + Integer rf = config.getInt(KafkaMetricsReporterServiceConfig.TOPIC_REPLICATION_FACTOR); + Utils.createTopicIfNotExists( + _topic, + rf.shortValue(), + 0, // partitions-to-brokers ratio; set to 0 so the partition count does not scale with the number of brokers.
+ 1, // fixed partition count 1 + new Properties(), + adminClient + ); + } + + @Override + public synchronized void start() { + _executor.scheduleAtFixedRate(() -> { + try { + reportMetrics(); + } catch (Exception e) { + LOGGER.error(_name + "/KafkaMetricsReporterService failed to report metrics.", e); + } + }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS); + LOGGER.info("{}/KafkaMetricsReporterService has started.", _name); + } + + @Override + public synchronized void stop() { + _executor.shutdown(); + _producer.close(); + LOGGER.info("{}/KafkaMetricsReporterService stopped.", _name); + } + + @Override + public boolean isRunning() { + return !_executor.isShutdown(); + } + + @Override + public void awaitShutdown(long timeout, TimeUnit timeUnit) { + try { + _executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + LOGGER.info("Thread interrupted when waiting for {}/KafkaMetricsReporterService to shutdown", _name); + } + LOGGER.info("{}/KafkaMetricsReporterService shutdown completed", _name); + } + + + private void initializeProducer() { + Properties producerProps = new Properties(); + producerProps.put(ProducerConfig.ACKS_CONFIG, "-1"); + producerProps.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, "20000"); + producerProps.put(ProducerConfig.RETRIES_CONFIG, "3"); + producerProps.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, String.valueOf(Long.MAX_VALUE)); + producerProps.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "1"); + producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + producerProps.put(ProducerConfig.CLIENT_ID_CONFIG, METRICS_PRODUCER_ID); + producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, _brokerList); + _producer = new KafkaProducer<>(producerProps); + } + + private void reportMetrics() { + Map metrics = new HashMap<>(); + for (String metricName : _metricsNames) { + String mbeanExpr = metricName.substring(0, metricName.lastIndexOf(":")); + String attributeExpr = metricName.substring(metricName.lastIndexOf(":") + 1); + List attributeValues = Utils.getMBeanAttributeValues(mbeanExpr, attributeExpr); + for (MbeanAttributeValue attributeValue : attributeValues) { + String metric = attributeValue.toString(); + String key = metric.substring(0, metric.lastIndexOf("=")); + String val = metric.substring(metric.lastIndexOf("=") + 1); + metrics.put(key, val); + } + } + try { + LOGGER.info("Kafka Metrics Reporter sending metrics = " + _parser.writerWithDefaultPrettyPrinter().writeValueAsString(metrics)); + _producer.send(new ProducerRecord<>(_topic, _parser.writeValueAsString(metrics))); + } catch (JsonProcessingException e) { + LOGGER.warn("unsupported json format: " + metrics, e); + } + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/KafkaMetricsReporterServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/KafkaMetricsReporterServiceFactory.java new file mode 100644 index 00000000..1eaa1419 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/KafkaMetricsReporterServiceFactory.java @@ -0,0 +1,41 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; +import org.apache.kafka.clients.admin.AdminClient; + + +/** + * Factory class which instantiates a KafkaMetricsReporterService service object. + */ +@SuppressWarnings("rawtypes") +public class KafkaMetricsReporterServiceFactory implements ServiceFactory { + + private final Map _properties; + private final String _serviceName; + + public KafkaMetricsReporterServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + + AdminClient adminClient = AdminClient.create(_properties); + + return new KafkaMetricsReporterService(_properties, _serviceName, adminClient); + + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementService.java b/src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementService.java new file mode 100644 index 00000000..dca1eb65 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementService.java @@ -0,0 +1,704 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.CommonServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.MultiClusterTopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.TopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.topicfactory.TopicFactory; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.CancellationException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import kafka.admin.AdminUtils; +import kafka.admin.BrokerMetadata; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.AlterConfigOp; +import org.apache.kafka.clients.admin.AlterPartitionReassignmentsResult; +import org.apache.kafka.clients.admin.Config; +import org.apache.kafka.clients.admin.ConfigEntry; +import org.apache.kafka.clients.admin.ElectLeadersResult; +import org.apache.kafka.clients.admin.NewPartitionReassignment; +import org.apache.kafka.clients.admin.NewPartitions; +import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.common.ElectionType; +import org.apache.kafka.common.KafkaException; +import org.apache.kafka.common.KafkaFuture; +import org.apache.kafka.common.Node; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.TopicPartitionInfo; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.config.ConfigResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Option$; +import scala.collection.JavaConverters; +import scala.collection.Seq; + + +/** + * This service periodically checks and re-balances the monitor topics across a pipeline of Kafka clusters so that + * leadership of the partitions of the monitor topic in each cluster is distributed evenly across brokers in the cluster. + * + * More specifically, this service may do some or all of the following tasks depending on the config: + * + * - Create the monitor topic using the user-specified replication factor and partition number. + * - Increase the partition count of the monitor topic if either partitionsToBrokersRatio or minPartitionNum is not satisfied. + * - Increase the replication factor of the monitor topic if the user-specified replicationFactor is not satisfied. + * - Reassign partitions across brokers to make sure each broker acts as the preferred leader of at least one partition of the monitor topic. + * - Trigger preferred leader election to make sure each broker acts as the leader of at least one partition of the monitor topic. + * - Make sure the number of partitions of the monitor topic is the same across all monitored clusters.
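+ * + * Each of the tasks above can be enabled or disabled independently through the topic management service configs; topic creation, partition addition, and partition reassignment / preferred leader election each have a dedicated enable flag (see the TopicManagementServiceConfig fields used below).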
+ * + */ +@SuppressWarnings({"rawtypes", "unchecked"}) +public class MultiClusterTopicManagementService implements Service { + private static final Logger LOGGER = LoggerFactory.getLogger(MultiClusterTopicManagementService.class); + private static final String METRIC_GROUP_NAME = "topic-management-service"; + private final CompletableFuture _topicPartitionResult = new CompletableFuture<>(); + private final AtomicBoolean _isRunning = new AtomicBoolean(false); + private final String _serviceName; + private final Map _topicManagementByCluster; + private final int _rebalanceIntervalMs; + private final long _preferredLeaderElectionIntervalMs; + private final ScheduledExecutorService _executor; + + @SuppressWarnings("unchecked") + public MultiClusterTopicManagementService(Map props, String serviceName) throws Exception { + _serviceName = serviceName; + MultiClusterTopicManagementServiceConfig config = new MultiClusterTopicManagementServiceConfig(props); + String topic = config.getString(CommonServiceConfig.TOPIC_CONFIG); + Map propsByCluster = + props.containsKey(MultiClusterTopicManagementServiceConfig.PROPS_PER_CLUSTER_CONFIG) ? (Map) props.get( + MultiClusterTopicManagementServiceConfig.PROPS_PER_CLUSTER_CONFIG) : new HashMap<>(); + _topicManagementByCluster = initializeTopicManagementHelper(propsByCluster, topic); + _rebalanceIntervalMs = config.getInt(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG); + _preferredLeaderElectionIntervalMs = + config.getLong(MultiClusterTopicManagementServiceConfig.PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_CONFIG); + _executor = Executors.newSingleThreadScheduledExecutor( + r -> new Thread(r, _serviceName + "-multi-cluster-topic-management-service")); + _topicPartitionResult.complete(null); + } + + public CompletableFuture topicPartitionResult() { + return _topicPartitionResult; + } + + private Map initializeTopicManagementHelper(Map propsByCluster, + String topic) throws Exception { + Map topicManagementByCluster = new HashMap<>(); + for (Map.Entry entry : propsByCluster.entrySet()) { + String clusterName = entry.getKey(); + Map serviceProps = entry.getValue(); + if (serviceProps.containsKey(MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG)) { + throw new ConfigException("The raw per-cluster config for MultiClusterTopicManagementService must not contain " + + MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG); + } + serviceProps.put(MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG, topic); + topicManagementByCluster.put(clusterName, new TopicManagementHelper(serviceProps)); + } + return topicManagementByCluster; + } + + @Override + public synchronized void start() { + if (_isRunning.compareAndSet(false, true)) { + final long topicManagementProcedureInitialDelay = 0; + _executor.scheduleWithFixedDelay( + new TopicManagementRunnable(), + topicManagementProcedureInitialDelay, + _rebalanceIntervalMs, + TimeUnit.MILLISECONDS); + + LOGGER.info("Topic management periodical procedure started with initial delay {} ms and interval {} ms", + topicManagementProcedureInitialDelay, _rebalanceIntervalMs); + + _executor.scheduleWithFixedDelay(new PreferredLeaderElectionRunnable(), _preferredLeaderElectionIntervalMs, + _preferredLeaderElectionIntervalMs, TimeUnit.MILLISECONDS); + LOGGER.info("Preferred leader election periodical procedure started with initial delay {} ms and interval {} ms", + _preferredLeaderElectionIntervalMs, _preferredLeaderElectionIntervalMs); + } + } + + @Override + public synchronized void stop() { + if 
(_isRunning.compareAndSet(true, false)) { + _executor.shutdown(); + LOGGER.info("{}/MultiClusterTopicManagementService stopped.", _serviceName); + } + } + + @Override + public boolean isRunning() { + return _isRunning.get() && !_executor.isShutdown(); + } + + @Override + public void awaitShutdown(long timeout, TimeUnit unit) { + try { + _executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + LOGGER.info("Thread interrupted when waiting for {}/MultiClusterTopicManagementService to shutdown", + _serviceName); + } + LOGGER.info("{}/MultiClusterTopicManagementService shutdown completed", _serviceName); + } + + private class TopicManagementRunnable implements Runnable { + + @Override + public void run() { + try { + for (TopicManagementHelper helper : _topicManagementByCluster.values()) { + helper.maybeCreateTopic(); + } + + /* + * The partition number of the monitor topics should be the minimum partition number that satisfies the following conditions: + * - partition number of the monitor topics across all monitored clusters should be the same + * - partitionNum / brokerNum >= user-configured partitionsToBrokersRatio. + * - partitionNum >= user-configured minPartitionNum + */ + + int minPartitionNum = 0; + for (TopicManagementHelper helper : _topicManagementByCluster.values()) { + minPartitionNum = Math.max(minPartitionNum, helper.minPartitionNum()); + } + for (TopicManagementHelper helper : _topicManagementByCluster.values()) { + helper.maybeAddPartitions(minPartitionNum); + } + + for (Map.Entry entry : _topicManagementByCluster.entrySet()) { + String clusterName = entry.getKey(); + TopicManagementHelper helper = entry.getValue(); + try { + helper.maybeReassignPartitionAndElectLeader(); + } catch (KafkaException e) { + LOGGER.warn(_serviceName + "/MultiClusterTopicManagementService will retry later in cluster " + clusterName, + e); + } + } + } catch (Throwable t) { + // Need to catch throwable because there is scala API that can throw NoSuchMethodError in runtime + // and such error is not caught by compilation + LOGGER.error(_serviceName + "/MultiClusterTopicManagementService will stop due to error.", t); + stop(); + } + } + } + + /** + * Check if Preferred leader election is requested during Topic Management (TopicManagementRunnable), + * trigger Preferred leader election when there is no partition reassignment in progress. + */ + private class PreferredLeaderElectionRunnable implements Runnable { + @Override + public void run() { + try { + for (Map.Entry entry : _topicManagementByCluster.entrySet()) { + String clusterName = entry.getKey(); + TopicManagementHelper helper = entry.getValue(); + try { + helper.maybeElectLeader(); + } catch (KafkaException e) { + LOGGER.warn(_serviceName + "/MultiClusterTopicManagementService will retry later in cluster " + clusterName, + e); + } + } + } catch (Throwable t) { + /* Need to catch throwable because there is scala API that can throw NoSuchMethodError in runtime + and such error is not caught by compilation. 
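+ Stopping the service here prevents the scheduled runnable from failing repeatedly on every subsequent run.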
*/ + LOGGER.error(_serviceName + + "/MultiClusterTopicManagementService/PreferredLeaderElectionRunnable will stop due to an error.", t); + stop(); + } + } + } + + @SuppressWarnings("FieldCanBeLocal") + static class TopicManagementHelper { + private final String _zkConnect; + private final int _replicationFactor; + private final double _minPartitionsToBrokersRatio; + private final int _minPartitionNum; + private final Properties _topicProperties; + private boolean _preferredLeaderElectionRequested; + private final Duration _requestTimeout; + private final List _bootstrapServers; + + // package private for unit testing + boolean _topicCreationEnabled; + boolean _topicAddPartitionEnabled; + boolean _topicReassignPartitionAndElectLeaderEnabled; + AdminClient _adminClient; + String _topic; + TopicFactory _topicFactory; + + @SuppressWarnings("unchecked") + TopicManagementHelper(Map props) throws Exception { + + TopicManagementServiceConfig config = new TopicManagementServiceConfig(props); + AdminClientConfig adminClientConfig = new AdminClientConfig(props); + String topicFactoryClassName = config.getString(TopicManagementServiceConfig.TOPIC_FACTORY_CLASS_CONFIG); + _topicCreationEnabled = config.getBoolean(TopicManagementServiceConfig.TOPIC_CREATION_ENABLED_CONFIG); + _topicAddPartitionEnabled = config.getBoolean(TopicManagementServiceConfig.TOPIC_ADD_PARTITION_ENABLED_CONFIG); + _topicReassignPartitionAndElectLeaderEnabled = config.getBoolean(TopicManagementServiceConfig.TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_CONFIG); + _topic = config.getString(TopicManagementServiceConfig.TOPIC_CONFIG); + _zkConnect = config.getString(TopicManagementServiceConfig.ZOOKEEPER_CONNECT_CONFIG); + _replicationFactor = config.getInt(TopicManagementServiceConfig.TOPIC_REPLICATION_FACTOR_CONFIG); + _minPartitionsToBrokersRatio = config.getDouble(TopicManagementServiceConfig.PARTITIONS_TO_BROKERS_RATIO_CONFIG); + _minPartitionNum = config.getInt(TopicManagementServiceConfig.MIN_PARTITION_NUM_CONFIG); + _preferredLeaderElectionRequested = false; + _requestTimeout = Duration.ofMillis(adminClientConfig.getInt(AdminClientConfig.REQUEST_TIMEOUT_MS_CONFIG)); + _bootstrapServers = adminClientConfig.getList(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG); + _topicProperties = new Properties(); + if (props.containsKey(TopicManagementServiceConfig.TOPIC_PROPS_CONFIG)) { + for (Map.Entry entry : ((Map) props.get( + TopicManagementServiceConfig.TOPIC_PROPS_CONFIG)).entrySet()) { + _topicProperties.put(entry.getKey(), entry.getValue().toString()); + } + } + + Map topicFactoryConfig = + props.containsKey(TopicManagementServiceConfig.TOPIC_FACTORY_PROPS_CONFIG) ? 
(Map) props.get( + TopicManagementServiceConfig.TOPIC_FACTORY_PROPS_CONFIG) : new HashMap(); + _topicFactory = + (TopicFactory) Class.forName(topicFactoryClassName).getConstructor(Map.class).newInstance(topicFactoryConfig); + _adminClient = constructAdminClient(props); + LOGGER.info("{} configs: {}", _adminClient.getClass().getSimpleName(), props); + logConfigurationValues(); + } + + private void logConfigurationValues() { + LOGGER.info("TopicManagementHelper for cluster with Zookeeper connect {} is configured with " + + "[topic={}, topicCreationEnabled={}, topicAddPartitionEnabled={}, " + + "topicReassignPartitionAndElectLeaderEnabled={}, minPartitionsToBrokersRatio={}, " + + "minPartitionNum={}]", _zkConnect, _topic, _topicCreationEnabled, _topicAddPartitionEnabled, + _topicReassignPartitionAndElectLeaderEnabled, _minPartitionsToBrokersRatio, _minPartitionNum); + } + + @SuppressWarnings("unchecked") + void maybeCreateTopic() throws Exception { + if (!_topicCreationEnabled) { + LOGGER.info("Topic creation is not enabled for {} in a cluster with Zookeeper URL {}. " + + "Refer to config: {}", _topic, _zkConnect, TopicManagementServiceConfig.TOPIC_CREATION_ENABLED_CONFIG); + return; + } + NewTopic newTopic = new NewTopic(_topic, minPartitionNum(), (short) _replicationFactor); + newTopic.configs((Map) _topicProperties); + _topicFactory.createTopicIfNotExist(_topic, (short) _replicationFactor, _minPartitionsToBrokersRatio, + _topicProperties, _adminClient); + } + + AdminClient constructAdminClient(Map props) { + return AdminClient.create(props); + } + + int minPartitionNum() throws InterruptedException, ExecutionException { + int brokerCount = _adminClient.describeCluster().nodes().get().size(); + return Math.max((int) Math.ceil(_minPartitionsToBrokersRatio * brokerCount), _minPartitionNum); + } + + void maybeAddPartitions(final int requiredMinPartitionNum) + throws ExecutionException, InterruptedException, CancellationException, TimeoutException { + if (!_topicAddPartitionEnabled) { + LOGGER.info("Adding partition to {} topic is not enabled in a cluster with Zookeeper URL {}. " + + "Refer to config: {}", _topic, _zkConnect, TopicManagementServiceConfig.TOPIC_ADD_PARTITION_ENABLED_CONFIG); + return; + } + Map> kafkaFutureMap = + _adminClient.describeTopics(Collections.singleton(_topic)).values(); + KafkaFuture topicDescriptions = kafkaFutureMap.get(_topic); + List partitions = topicDescriptions.get(_requestTimeout.toMillis(), TimeUnit.MILLISECONDS).partitions(); + + final int currPartitionNum = partitions.size(); + if (currPartitionNum >= requiredMinPartitionNum) { + LOGGER.debug("{} will not increase partition of the topic {} in the cluster. 
Current partition count is {} and the " + + "minimum required partition count is {}.", this.getClass().toString(), _topic, currPartitionNum, requiredMinPartitionNum); + return; + } + LOGGER.info("{} will increase the partition count of the topic {} in the cluster from {}" + " to {}.", + this.getClass().toString(), _topic, currPartitionNum, requiredMinPartitionNum); + Set brokers = new HashSet<>(); + for (Node broker : _adminClient.describeCluster().nodes().get(_requestTimeout.toMillis(), TimeUnit.MILLISECONDS)) { + BrokerMetadata brokerMetadata = new BrokerMetadata(broker.id(), null); + brokers.add(brokerMetadata); + } + Set excludedBrokers = _topicFactory.getExcludedBrokers(_adminClient); + if (!excludedBrokers.isEmpty()) { + brokers.removeIf(broker -> excludedBrokers.contains(broker.id())); + } + + List> newPartitionAssignments = + newPartitionAssignments(requiredMinPartitionNum, currPartitionNum, brokers, _replicationFactor); + + NewPartitions newPartitions = NewPartitions.increaseTo(requiredMinPartitionNum, newPartitionAssignments); + + Map newPartitionsMap = new HashMap<>(); + newPartitionsMap.put(_topic, newPartitions); + _adminClient.createPartitions(newPartitionsMap).all().get(_requestTimeout.toMillis(), TimeUnit.MILLISECONDS); + LOGGER.info("{} finished increasing the partition count of the topic {} in the cluster from {} to {}.", + this.getClass().toString(), _topic, currPartitionNum, requiredMinPartitionNum); + } + + static List> newPartitionAssignments(int minPartitionNum, int partitionNum, + Set brokers, int rf) { + + // The replica assignments below are for the new partitions only, not for the existing partitions. + // .increaseTo(6, asList(asList(1, 2), + // asList(2, 3), + // asList(3, 1))) + // partition 3's preferred leader will be broker 1, + // partition 4's preferred leader will be broker 2 and + // partition 5's preferred leader will be broker 3. + List> newPartitionAssignments = new ArrayList<>(); + int partitionDifference = minPartitionNum - partitionNum; + + // leader assignments - + while (newPartitionAssignments.size() != partitionDifference) { + List replicas = new ArrayList<>(); + // leader replica/broker - + int brokerMetadata = randomBroker(brokers).id(); + replicas.add(brokerMetadata); + + newPartitionAssignments.add(replicas); + } + + // follower assignments - + // Regardless of the partition/replica assignments here, maybeReassignPartitionAndElectLeader() + // will reassign the partition as needed periodically. + for (List replicas : newPartitionAssignments) { + for (BrokerMetadata broker : brokers) { + if (!replicas.contains(broker.id())) { + replicas.add(broker.id()); + } + if (replicas.size() == rf) { + break; + } + } + } + return newPartitionAssignments; + } + + private static BrokerMetadata randomBroker(Set brokers) { + + if (brokers == null || brokers.size() == 0) { + throw new IllegalArgumentException("brokers object is either null or empty."); + } + + // A Set forces an O(n) copy before an element can be picked by index. Since the broker set does not change in + // newPartitionAssignments, accepting a List argument instead of a Set would allow O(1) random access. + List brokerMetadataList = new ArrayList<>(brokers); + // Convert to a list so a random index can be used directly instead of iterating the set. + // The parameterized constructor is used instead of addAll() for better performance. + + int brokerSetSize = brokers.size(); + + // In practice, the Random instance should be shared rather than created on every call.
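+ // (for example, java.util.concurrent.ThreadLocalRandom.current().nextInt(brokerSetSize) would avoid allocating a new Random here)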
+ int random = new Random().nextInt(brokerSetSize); + + return brokerMetadataList.get(random); + } + + /** + * Exposed package-private access for testing. Get the total number of partitions for a Kafka topic. + * @return total number of topic partitions + * @throws InterruptedException when a thread is waiting, sleeping and the thread is interrupted, either before / during the activity. + * @throws ExecutionException when attempting to retrieve the result of a task that aborted by throwing an exception. + */ + int numPartitions() throws InterruptedException, ExecutionException { + + return _adminClient.describeTopics(Collections.singleton(_topic)).values().get(_topic).get().partitions().size(); + } + + private Set getAvailableBrokers() throws ExecutionException, InterruptedException { + Set brokers = new HashSet<>(_adminClient.describeCluster().nodes().get()); + Set excludedBrokers = _topicFactory.getExcludedBrokers(_adminClient); + brokers.removeIf(broker -> excludedBrokers.contains(broker.id())); + return brokers; + } + + void maybeReassignPartitionAndElectLeader() throws ExecutionException, InterruptedException, TimeoutException { + if (!_topicReassignPartitionAndElectLeaderEnabled) { + LOGGER.info("Reassign partition and elect leader to {} topic is not enabled in a cluster with Zookeeper URL {}. " + + "Refer to config: {}", _topic, _zkConnect, TopicManagementServiceConfig.TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_CONFIG); + return; + } + List partitionInfoList = + _adminClient.describeTopics(Collections.singleton(_topic)).all().get().get(_topic).partitions(); + Collection brokers = this.getAvailableBrokers(); + boolean partitionReassigned = false; + if (partitionInfoList.size() == 0) { + throw new IllegalStateException("Topic " + _topic + " does not exist in cluster."); + } + + int currentReplicationFactor = getReplicationFactor(partitionInfoList); + int expectedReplicationFactor = Math.max(currentReplicationFactor, _replicationFactor); + + if (_replicationFactor < currentReplicationFactor) { + LOGGER.debug( + "Configured replication factor {} is smaller than the current replication factor {} of the topic {} in cluster.", + _replicationFactor, currentReplicationFactor, _topic); + } + + if (expectedReplicationFactor > currentReplicationFactor && Utils.ongoingPartitionReassignments(_adminClient) + .isEmpty()) { + + LOGGER.info( + "MultiClusterTopicManagementService will increase the replication factor of the topic {} in cluster" + + "from {} to {}", _topic, currentReplicationFactor, expectedReplicationFactor); + reassignPartitions(_adminClient, brokers, _topic, partitionInfoList.size(), expectedReplicationFactor); + + partitionReassigned = true; + } + + // Update the properties of the monitor topic if any config is different from the user-specified config + ConfigResource topicConfigResource = new ConfigResource(ConfigResource.Type.TOPIC, _topic); + Config currentConfig = _adminClient.describeConfigs(Collections.singleton(topicConfigResource)).all().get().get(topicConfigResource); + Collection alterConfigOps = new ArrayList<>(); + for (Map.Entry entry : _topicProperties.entrySet()) { + String name = String.valueOf(entry.getKey()); + ConfigEntry configEntry = new ConfigEntry(name, String.valueOf(entry.getValue())); + if (!configEntry.equals(currentConfig.get(name))) { + alterConfigOps.add(new AlterConfigOp(configEntry, AlterConfigOp.OpType.SET)); + } + } + + if (!alterConfigOps.isEmpty()) { + LOGGER.info("MultiClusterTopicManagementService will overwrite properties of the topic {} 
" + + "in cluster with {}.", _topic, alterConfigOps); + Map> configs = Collections.singletonMap(topicConfigResource, alterConfigOps); + _adminClient.incrementalAlterConfigs(configs); + } + + if (partitionInfoList.size() >= brokers.size() && someBrokerNotPreferredLeader(partitionInfoList, brokers) + && Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) { + LOGGER.info("{} will reassign partitions of the topic {} in cluster.", this.getClass().toString(), _topic); + reassignPartitions(_adminClient, brokers, _topic, partitionInfoList.size(), expectedReplicationFactor); + + partitionReassigned = true; + } + + if (partitionInfoList.size() >= brokers.size() && someBrokerNotElectedLeader(partitionInfoList, brokers)) { + if (!partitionReassigned || Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) { + LOGGER.info("MultiClusterTopicManagementService will trigger preferred leader election for the topic {} in " + + "cluster.", _topic); + triggerPreferredLeaderElection(partitionInfoList, _topic); + _preferredLeaderElectionRequested = false; + } else { + _preferredLeaderElectionRequested = true; + } + } + } + + void maybeElectLeader() throws InterruptedException, ExecutionException, TimeoutException { + if (!_preferredLeaderElectionRequested) { + return; + } + + if (Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) { + List partitionInfoList = + _adminClient.describeTopics(Collections.singleton(_topic)).all().get().get(_topic).partitions(); + LOGGER.info("MultiClusterTopicManagementService will trigger requested preferred leader election for the" + + " topic {} in cluster.", _topic); + triggerPreferredLeaderElection(partitionInfoList, _topic); + _preferredLeaderElectionRequested = false; + } + } + + private void triggerPreferredLeaderElection(List partitionInfoList, String partitionTopic) { + Set partitions = new HashSet<>(); + for (TopicPartitionInfo javaPartitionInfo : partitionInfoList) { + partitions.add(new TopicPartition(partitionTopic, javaPartitionInfo.partition())); + } + ElectLeadersResult electLeadersResult = _adminClient.electLeaders(ElectionType.PREFERRED, partitions); + + LOGGER.info("{}: triggerPreferredLeaderElection - {}", this.getClass().toString(), + electLeadersResult.all()); + } + + private static void reassignPartitions(AdminClient adminClient, Collection brokers, String topic, + int partitionCount, int replicationFactor) { + + scala.collection.mutable.ArrayBuffer brokersMetadata = + new scala.collection.mutable.ArrayBuffer<>(brokers.size()); + for (Node broker : brokers) { + brokersMetadata.$plus$eq(new BrokerMetadata(broker.id(), Option$.MODULE$.apply(broker.rack()))); + } + scala.collection.Map> assignedReplicas = + AdminUtils.assignReplicasToBrokers(brokersMetadata, partitionCount, replicationFactor, 0, 0); + scala.collection.immutable.Map> newAssignment = + new scala.collection.immutable.HashMap<>(); + scala.collection.Iterator>> it = assignedReplicas.iterator(); + while (it.hasNext()) { + scala.Tuple2> scalaTuple = it.next(); + TopicPartition tp = new TopicPartition(topic, (Integer) scalaTuple._1); + newAssignment = newAssignment.$plus(new scala.Tuple2<>(tp, scalaTuple._2)); + } + + String newAssignmentJson = formatAsNewReassignmentJson(topic, assignedReplicas); + LOGGER.info("Reassign partitions for topic " + topic); + LOGGER.info("New topic partition replica assignments: {}", newAssignmentJson); + + Set>> newAssignmentMap = + scala.collection.JavaConverters.mapAsJavaMap(newAssignment).entrySet(); + Map> reassignments = new HashMap<>(); + for 
(Map.Entry> topicPartitionSeqEntry : newAssignmentMap) { + List targetReplicas = new ArrayList<>(); + List replicas = JavaConverters.seqAsJavaList(topicPartitionSeqEntry.getValue()); + for (Object replica : replicas) { + targetReplicas.add((int) replica); + } + NewPartitionReassignment newPartitionReassignment = new NewPartitionReassignment(targetReplicas); + reassignments.put(topicPartitionSeqEntry.getKey(), Optional.of(newPartitionReassignment)); + } + + AlterPartitionReassignmentsResult alterPartitionReassignmentsResult = + adminClient.alterPartitionReassignments(reassignments); + try { + alterPartitionReassignmentsResult.all().get(); + } catch (InterruptedException | ExecutionException e) { + + LOGGER.error("An exception occurred while altering the partition reassignments for {}", topic, e); + } + } + + static int getReplicationFactor(List partitionInfoList) { + if (partitionInfoList.isEmpty()) { + throw new RuntimeException("Partition list is empty."); + } + + int replicationFactor = partitionInfoList.get(0).replicas().size(); + for (TopicPartitionInfo partitionInfo : partitionInfoList) { + if (replicationFactor != partitionInfo.replicas().size()) { + LOGGER.warn("Partitions of the topic have different replication factor."); + return -1; + } + } + return replicationFactor; + } + + static boolean someBrokerNotPreferredLeader(List partitionInfoList, Collection brokers) { + Set brokersNotPreferredLeader = new HashSet<>(brokers.size()); + for (Node broker : brokers) { + brokersNotPreferredLeader.add(broker.id()); + } + for (TopicPartitionInfo partitionInfo : partitionInfoList) { + brokersNotPreferredLeader.remove(partitionInfo.replicas().get(0).id()); + } + + return !brokersNotPreferredLeader.isEmpty(); + } + + static boolean someBrokerNotElectedLeader(List partitionInfoList, Collection brokers) { + Set brokersNotElectedLeader = new HashSet<>(brokers.size()); + for (Node broker : brokers) { + brokersNotElectedLeader.add(broker.id()); + } + for (TopicPartitionInfo partitionInfo : partitionInfoList) { + if (partitionInfo.leader() != null) { + brokersNotElectedLeader.remove(partitionInfo.leader().id()); + } + } + return !brokersNotElectedLeader.isEmpty(); + } + + /** + * @param topic topic + * @param partitionsToBeReassigned a map from partition (int) to replica list (int seq) + * + * @return a json string with the same format as output of kafka.utils.ZkUtils.formatAsReassignmentJson + * + * Example: + *
+     *   {"version":1,"partitions":[
+     *     {"topic":"kmf-topic","partition":1,"replicas":[0,1]},
+     *     {"topic":"kmf-topic","partition":2,"replicas":[1,2]},
+     *     {"topic":"kmf-topic","partition":0,"replicas":[2,0]}]}
+     * 
+ */ + + // TODO (andrewchoi5): uncomment this method when Xinfra Monitor is upgraded to 'org.apache.kafka' 'kafka_2.12' version '2.4.1' +// private static String formatAsOldAssignmentJson(String topic, scala.collection.Map partitionsToBeReassigned) { +// StringBuilder bldr = new StringBuilder(); +// bldr.append("{\"version\":1,\"partitions\":[\n"); +// for (int partition = 0; partition < partitionsToBeReassigned.size(); partition++) { +// bldr.append(" {\"topic\":\"").append(topic).append("\",\"partition\":").append(partition).append(",\"replicas\":["); +// ReplicaAssignment replicas = partitionsToBeReassigned.apply(partition); +// for (int replicaIndex = 0; replicaIndex < replicas.replicas().size(); replicaIndex++) { +// Object replica = replicas.replicas().apply(replicaIndex); +// bldr.append(replica).append(","); +// } +// bldr.setLength(bldr.length() - 1); +// bldr.append("]},\n"); +// } +// bldr.setLength(bldr.length() - 2); +// bldr.append("]}"); +// return bldr.toString(); +// } + + /** + * @param topic Kafka topic + * @param partitionsToReassign a map from partition (int) to new replica list (int seq) + * + * @return a json string with the same format as output of kafka.utils.ZkUtils.formatAsReassignmentJson + * + * Example: + *
+     *   {"version":1,"partitions":[
+     *     {"topic":"kmf-topic","partition":1,"replicas":[0,1]},
+     *     {"topic":"kmf-topic","partition":2,"replicas":[1,2]},
+     *     {"topic":"kmf-topic","partition":0,"replicas":[2,0]}]}
+     * 
+ */ + private static String formatAsNewReassignmentJson(String topic, + scala.collection.Map> partitionsToReassign) { + StringBuilder builder = new StringBuilder(); + builder.append("{\"version\":1,\"partitions\":[\n"); + for (int partition = 0; partition < partitionsToReassign.size(); partition++) { + builder.append(" {\"topic\":\"") + .append(topic) + .append("\",\"partition\":") + .append(partition) + .append(",\"replicas\":["); + Seq replicas = partitionsToReassign.apply(partition); + for (int replicaIndex = 0; replicaIndex < replicas.size(); replicaIndex++) { + Object replica = replicas.apply(replicaIndex); + builder.append(replica).append(","); + } + builder.setLength(builder.length() - 1); + builder.append("]},\n"); + } + builder.setLength(builder.length() - 2); + builder.append("]}"); + return builder.toString(); + } + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementServiceFactory.java new file mode 100644 index 00000000..9e90169a --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementServiceFactory.java @@ -0,0 +1,36 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory which instantiates a MultiClusterTopicManagementService service object. + */ +@SuppressWarnings("rawtypes") +public class MultiClusterTopicManagementServiceFactory implements ServiceFactory { + + private final Map _properties; + private final String _serviceName; + + public MultiClusterTopicManagementServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + return new MultiClusterTopicManagementService(_properties, _serviceName); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/OffsetCommitService.java b/src/main/java/com/linkedin/xinfra/monitor/services/OffsetCommitService.java new file mode 100644 index 00000000..42f76a0d --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/OffsetCommitService.java @@ -0,0 +1,278 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +package com.linkedin.xinfra.monitor.services; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.linkedin.xinfra.monitor.XinfraMonitorConstants; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.metrics.OffsetCommitServiceMetrics; +import java.net.InetSocketAddress; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.kafka.clients.ApiVersions; +import org.apache.kafka.clients.ClientDnsLookup; +import org.apache.kafka.clients.ClientResponse; +import org.apache.kafka.clients.ClientUtils; +import org.apache.kafka.clients.KafkaClient; +import org.apache.kafka.clients.Metadata; +import org.apache.kafka.clients.NetworkClient; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.internals.ConsumerNetworkClient; +import org.apache.kafka.clients.consumer.internals.RequestFuture; +import org.apache.kafka.common.Node; +import org.apache.kafka.common.internals.ClusterResourceListeners; +import org.apache.kafka.common.message.OffsetCommitRequestData; +import org.apache.kafka.common.metrics.JmxReporter; +import org.apache.kafka.common.metrics.MetricConfig; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.MetricsReporter; +import org.apache.kafka.common.network.ChannelBuilder; +import org.apache.kafka.common.network.Selector; +import org.apache.kafka.common.requests.AbstractRequest; +import org.apache.kafka.common.requests.OffsetCommitRequest; +import org.apache.kafka.common.utils.LogContext; +import org.apache.kafka.common.utils.Time; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Service that monitors the commit offset availability of a particular Consumer Group. + */ +public class OffsetCommitService implements Service { + + public static final String METRIC_GRP_PREFIX = "xm-offset-commit-service"; + private static final int MAX_INFLIGHT_REQUESTS_PER_CONNECTION = 100; + private static final Logger LOGGER = LoggerFactory.getLogger(OffsetCommitService.class); + private static final String SERVICE_SUFFIX = "-consumer-offset-commit-service"; + private final AtomicBoolean _isRunning; + private final ScheduledExecutorService _scheduledExecutorService; + private final String _serviceName; + private final AdminClient _adminClient; + private final String _consumerGroup; + + // the consumer network client that communicates with kafka cluster brokers. + private final ConsumerNetworkClient _consumerNetworkClient; + private final Time _time; + private final OffsetCommitServiceMetrics _offsetCommitServiceMetrics; + + /** + * + * @param config The consumer configuration keys + * @param serviceName name of the xinfra monitor service + * @param adminClient Administrative client for Kafka, which supports managing and inspecting topics, brokers, configurations and ACLs. 
+ */ + OffsetCommitService(ConsumerConfig config, String serviceName, AdminClient adminClient) + throws JsonProcessingException { + + _time = Time.SYSTEM; + _consumerGroup = config.getString(ConsumerConfig.GROUP_ID_CONFIG); + _adminClient = adminClient; + _isRunning = new AtomicBoolean(false); + _serviceName = serviceName; + + List reporters = new ArrayList<>(); + reporters.add(new JmxReporter(Service.JMX_PREFIX)); + MetricConfig metricConfig = new MetricConfig().samples(60).timeWindow(1000, TimeUnit.MILLISECONDS); + Metrics metrics = new Metrics(metricConfig, reporters, _time); + Map tags = new HashMap<>(); + tags.put(XinfraMonitorConstants.TAGS_NAME, serviceName); + + _offsetCommitServiceMetrics = new OffsetCommitServiceMetrics(metrics, tags); + + long retryBackoffMs = config.getLong(ConsumerConfig.RETRY_BACKOFF_MS_CONFIG); + int heartbeatIntervalMs = config.getInt(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG); + + String clientId = config.getString(ConsumerConfig.CLIENT_ID_CONFIG); + + List bootstrapServers = config.getList(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG); + List addresses = + ClientUtils.parseAndValidateAddresses(bootstrapServers, ClientDnsLookup.DEFAULT); + + LogContext logContext = new LogContext("[Consumer clientId=" + clientId + "] "); + + ChannelBuilder channelBuilder = ClientUtils.createChannelBuilder(config, _time, logContext); + + LOGGER.info("Bootstrap servers config: {} | broker addresses: {}", bootstrapServers, addresses); + + Metadata metadata = new Metadata(retryBackoffMs, config.getLong(ConsumerConfig.METADATA_MAX_AGE_CONFIG), logContext, + new ClusterResourceListeners()); + + metadata.bootstrap(addresses); + + Selector selector = + new Selector(config.getLong(ConsumerConfig.CONNECTIONS_MAX_IDLE_MS_CONFIG), new Metrics(), _time, + METRIC_GRP_PREFIX, channelBuilder, logContext); + + KafkaClient kafkaClient = new NetworkClient( + selector, metadata, clientId, MAX_INFLIGHT_REQUESTS_PER_CONNECTION, + config.getLong(ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG), + config.getLong(ConsumerConfig.RECONNECT_BACKOFF_MAX_MS_CONFIG), + config.getInt(ConsumerConfig.SEND_BUFFER_CONFIG), config.getInt(ConsumerConfig.RECEIVE_BUFFER_CONFIG), + config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG), + config.getLong(ConsumerConfig.SOCKET_CONNECTION_SETUP_TIMEOUT_MS_CONFIG), config.getLong(ConsumerConfig.SOCKET_CONNECTION_SETUP_TIMEOUT_MAX_MS_CONFIG), + ClientDnsLookup.DEFAULT, _time, true, + new ApiVersions(), logContext); + + + LOGGER.debug("The network client active: {}", kafkaClient.active()); + LOGGER.debug("The network client has in flight requests: {}", kafkaClient.hasInFlightRequests()); + LOGGER.debug("The network client in flight request count: {}", kafkaClient.inFlightRequestCount()); + + _consumerNetworkClient = new ConsumerNetworkClient(logContext, kafkaClient, metadata, _time, retryBackoffMs, + config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG), heartbeatIntervalMs); + + ThreadFactory threadFactory = new ThreadFactory() { + @Override + public Thread newThread(Runnable runnable) { + return new Thread(runnable, serviceName + SERVICE_SUFFIX); + } + }; + _scheduledExecutorService = Executors.newSingleThreadScheduledExecutor(threadFactory); + + LOGGER.info("OffsetCommitService's ConsumerConfig - {}", Utils.prettyPrint(config.values())); + } + + /** + * The start logic must only execute once. If an error occurs then the implementer of this class must assume that + * stop() will be called to clean up. 
This method must be thread safe and must assume that stop() may be called + * concurrently. This can happen if the monitoring application's life cycle is being managed by a container. Start + * will only be called once. + */ + @Override + public void start() { + if (_isRunning.compareAndSet(false, true)) { + + Runnable runnable = new OffsetCommitServiceRunnable(); + _scheduledExecutorService.scheduleWithFixedDelay(runnable, 1, 2, TimeUnit.SECONDS); + LOGGER.info("Scheduled the offset commit service executor."); + } + } + + private class OffsetCommitServiceRunnable implements Runnable { + @Override + public void run() { + try { + sendOffsetCommitRequest(_consumerNetworkClient, _adminClient, _consumerGroup); + } catch (ExecutionException | InterruptedException e) { + LOGGER.error("OffsetCommitServiceRunnable class encountered an exception: ", e); + } + } + } + + /** + * + * @param consumerNetworkClient Kafka consumer network client. Higher level consumer access + * to the network layer with basic support for request futures. + * @param adminClient admin client object + * @param consumerGroup consumer group name + * @throws ExecutionException when attempting to retrieve the result of a task that aborted by throwing an exception + * @throws InterruptedException Thrown when the thread is waiting, sleeping, or otherwise occupied, + * and the thread is interrupted, either before or during the activity. + */ + private void sendOffsetCommitRequest(ConsumerNetworkClient consumerNetworkClient, AdminClient adminClient, + String consumerGroup) throws ExecutionException, InterruptedException, RuntimeException { + + + LOGGER.trace("Consumer groups available: {}", adminClient.listConsumerGroups().all().get()); + + Node groupCoordinator = adminClient.describeConsumerGroups(Collections.singleton(consumerGroup)) + .all() + .get() + .get(consumerGroup) + .coordinator(); + LOGGER.trace("Consumer group {} coordinator {}, consumer group {}", consumerGroup, groupCoordinator, consumerGroup); + + consumerNetworkClient.tryConnect(groupCoordinator); + consumerNetworkClient.maybeTriggerWakeup(); + + OffsetCommitRequestData offsetCommitRequestData = new OffsetCommitRequestData(); + AbstractRequest.Builder offsetCommitRequestBuilder = new OffsetCommitRequest.Builder(offsetCommitRequestData); + + LOGGER.debug("pending request count: {}", consumerNetworkClient.pendingRequestCount()); + + RequestFuture future = consumerNetworkClient.send(groupCoordinator, offsetCommitRequestBuilder); + + if (consumerNetworkClient.isUnavailable(groupCoordinator)) { + _offsetCommitServiceMetrics.recordUnavailable(); + throw new RuntimeException("Unavailable consumerNetworkClient for " + groupCoordinator); + } else { + LOGGER.trace("The consumerNetworkClient is available for {}", groupCoordinator); + if (consumerNetworkClient.hasPendingRequests()) { + + boolean consumerNetworkClientPollResult = + consumerNetworkClient.poll(future, _time.timer(Duration.ofSeconds(5).toMillis())); + LOGGER.debug("result of poll {}", consumerNetworkClientPollResult); + + if (future.failed() && !future.isRetriable()) { + _offsetCommitServiceMetrics.recordFailed(); + throw future.exception(); + } + + if (future.succeeded() && future.isDone() && consumerNetworkClientPollResult) { + + ClientResponse clientResponse = future.value(); + + _offsetCommitServiceMetrics.recordSuccessful(); + LOGGER.info("ClientResponseRequestFuture value {} for coordinator {} and consumer group {}", clientResponse, + groupCoordinator, consumerGroup); + } + } + } + } + + /** + * This may be 
called multiple times. This method must be thread safe and must assume that start() may be called + * concurrently. This can happen if the monitoring application's life cycle is being managed by a container. + * Implementations must be non-blocking and should release the resources acquired by the service during start(). + */ + @Override + public void stop() { + if (_isRunning.compareAndSet(true, false)) { + _scheduledExecutorService.shutdown(); + } + } + + /** + * Implementations of this method must be thread safe as it can be called at any time. Implementations must be + * non-blocking. + * @return true if this start() has returned successfully else this must return false. This must also return false if + * the service can no longer perform its function. + */ + @Override + public boolean isRunning() { + return _isRunning.get() && !_scheduledExecutorService.isShutdown(); + } + + /** + * Implementations of this method must be thread safe and must be blocking. + */ + @Override + public void awaitShutdown(long timeout, TimeUnit unit) { + try { + _scheduledExecutorService.awaitTermination(timeout, unit); + } catch (InterruptedException interruptedException) { + LOGGER.error("Thread interrupted when waiting for {} to shutdown.", _serviceName, interruptedException); + } + LOGGER.info("{} shutdown completed.", _serviceName); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/OffsetCommitServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/OffsetCommitServiceFactory.java new file mode 100644 index 00000000..87bd4f88 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/OffsetCommitServiceFactory.java @@ -0,0 +1,84 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
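Stripped of logging and metrics, sendOffsetCommitRequest above follows ConsumerNetworkClient's standard send-then-poll cycle. A distilled sketch of that cycle (the helper class and method names are illustrative, not from this patch):

```java
import java.time.Duration;
import org.apache.kafka.clients.ClientResponse;
import org.apache.kafka.clients.consumer.internals.ConsumerNetworkClient;
import org.apache.kafka.clients.consumer.internals.RequestFuture;
import org.apache.kafka.common.Node;
import org.apache.kafka.common.requests.AbstractRequest;
import org.apache.kafka.common.utils.Time;

final class SendAndPollSketch {
  // Distilled request/response cycle: connect to the coordinator, queue the
  // request, then drive network I/O until the future resolves or the timer expires.
  static ClientResponse sendAndWait(ConsumerNetworkClient client, Node coordinator,
      AbstractRequest.Builder<?> request, Time time) {
    client.tryConnect(coordinator);
    RequestFuture<ClientResponse> future = client.send(coordinator, request);
    client.poll(future, time.timer(Duration.ofSeconds(5).toMillis()));
    if (future.failed()) {
      throw future.exception();
    }
    return future.value();
  }
}
```

Checking future.failed() before reading future.value() mirrors the error handling in the service above.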
+ */ + +package com.linkedin.xinfra.monitor.services; + + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.linkedin.xinfra.monitor.XinfraMonitorConstants; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.CommonServiceConfig; +import java.util.Map; +import java.util.Properties; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Factory for OffsetCommitService + */ +@SuppressWarnings("rawtypes") +public class OffsetCommitServiceFactory implements ServiceFactory { + + private static final Logger LOGGER = LoggerFactory.getLogger(OffsetCommitServiceFactory.class); + private final Map _properties; + private final String _serviceName; + + public OffsetCommitServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws JsonProcessingException { + LOGGER.info("Creating OffsetCommitService..."); + AdminClient adminClient = AdminClient.create(_properties); + + Properties preparedProps = this.prepareConfigs(_properties); + ConsumerConfig consumerConfig = new ConsumerConfig(preparedProps); + LOGGER.info("OffsetCommitServiceFactory consumer config {}", Utils.prettyPrint(consumerConfig.values())); + + return new OffsetCommitService(consumerConfig, _serviceName, adminClient); + } + + /** + * populate configs for kafka client + * @param props Map of String to Object + * @return Properties + */ + @SuppressWarnings("unchecked") + private Properties prepareConfigs(Map props) { + + String zkConnect = (String) props.get(CommonServiceConfig.ZOOKEEPER_CONNECT_CONFIG); + String brokerList = (String) props.get(CommonServiceConfig.BOOTSTRAP_SERVERS_CONFIG); + + Properties consumerProps = new Properties(); + consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, XinfraMonitorConstants.FALSE); + consumerProps.put(ConsumerConfig.CLIENT_ID_CONFIG, XinfraMonitorConstants.XINFRA_MONITOR_PREFIX + _serviceName); + consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); + consumerProps.put(CommonServiceConfig.ZOOKEEPER_CONNECT_CONFIG, zkConnect); + + Map customProps = (Map) props.get(CommonServiceConfig.CONSUMER_PROPS_CONFIG); + if (customProps != null) { + for (Map.Entry entry : customProps.entrySet()) { + consumerProps.put(entry.getKey(), entry.getValue()); + } + } + + return consumerProps; + } +} diff --git a/src/main/java/com/linkedin/kmf/services/ProduceService.java b/src/main/java/com/linkedin/xinfra/monitor/services/ProduceService.java similarity index 58% rename from src/main/java/com/linkedin/kmf/services/ProduceService.java rename to src/main/java/com/linkedin/xinfra/monitor/services/ProduceService.java index 8e2e1290..28e49242 100644 --- a/src/main/java/com/linkedin/kmf/services/ProduceService.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ProduceService.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. 
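prepareConfigs above seeds non-negotiable defaults first and then copies consumer.props entries over them, so operator-supplied values win. A minimal sketch of that merge order, with literal key strings standing in for the ConsumerConfig constants used in the real code:

```java
import java.util.Map;
import java.util.Properties;

final class ConsumerPropsMergeSketch {
  static Properties merge(Map<String, String> customConsumerProps) {
    Properties props = new Properties();
    // Service defaults go in first (lowest priority).
    props.put("enable.auto.commit", "false");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    // Entries from consumer.props overwrite the defaults (highest priority).
    if (customConsumerProps != null) {
      customConsumerProps.forEach(props::put);
    }
    return props;
  }
}
```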
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,21 +7,25 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services; - -import com.linkedin.kmf.common.Utils; -import com.linkedin.kmf.partitioner.KMPartitioner; -import com.linkedin.kmf.producer.BaseProducerRecord; -import com.linkedin.kmf.producer.KMBaseProducer; -import com.linkedin.kmf.producer.NewProducer; -import com.linkedin.kmf.services.configs.ProduceServiceConfig; +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.partitioner.KMPartitioner; +import com.linkedin.xinfra.monitor.producer.BaseProducerRecord; +import com.linkedin.xinfra.monitor.producer.KMBaseProducer; +import com.linkedin.xinfra.monitor.producer.NewProducer; +import com.linkedin.xinfra.monitor.services.configs.ProduceServiceConfig; +import com.linkedin.xinfra.monitor.services.metrics.ProduceMetrics; +import java.time.Duration; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadFactory; @@ -29,34 +33,30 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.TopicDescription; import org.apache.kafka.clients.producer.ProducerConfig; import org.apache.kafka.clients.producer.RecordMetadata; -import org.apache.kafka.common.MetricName; import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.common.metrics.JmxReporter; -import org.apache.kafka.common.metrics.Measurable; import org.apache.kafka.common.metrics.MetricConfig; import org.apache.kafka.common.metrics.Metrics; import org.apache.kafka.common.metrics.MetricsReporter; -import org.apache.kafka.common.metrics.Sensor; -import org.apache.kafka.common.metrics.stats.Rate; -import org.apache.kafka.common.metrics.stats.Total; import org.apache.kafka.common.utils.SystemTime; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ProduceService implements Service { +@SuppressWarnings("rawtypes") +public class ProduceService extends AbstractService { private static final Logger LOG = LoggerFactory.getLogger(ProduceService.class); - private static final String METRIC_GROUP_NAME = "produce-service"; - private static final String[] NONOVERRIDABLE_PROPERTIES = new String[]{ + private static final String[] NON_OVERRIDABLE_PROPERTIES = new String[]{ ProduceServiceConfig.BOOTSTRAP_SERVERS_CONFIG, ProduceServiceConfig.ZOOKEEPER_CONNECT_CONFIG }; - private final String _name; private final ProduceMetrics _sensors; private KMBaseProducer _producer; - private KMPartitioner _partitioner; + private final KMPartitioner _partitioner; private ScheduledExecutorService _produceExecutor; private final 
ScheduledExecutorService _handleNewPartitionsExecutor; private final int _produceDelayMs; @@ -73,15 +73,18 @@ public class ProduceService implements Service { private final Map _producerPropsOverride; private final String _producerClassName; private final int _threadsNum; - private final String _zkConnect; + private final AdminClient _adminClient; + private static final String KEY_SERIALIZER_CLASS = "org.apache.kafka.common.serialization.StringSerializer"; public ProduceService(Map props, String name) throws Exception { + // TODO: Make values of below fields come from configs + super(10, Duration.ofMinutes(1)); _name = name; ProduceServiceConfig config = new ProduceServiceConfig(props); - _zkConnect = config.getString(ProduceServiceConfig.ZOOKEEPER_CONNECT_CONFIG); _brokerList = config.getString(ProduceServiceConfig.BOOTSTRAP_SERVERS_CONFIG); String producerClass = config.getString(ProduceServiceConfig.PRODUCER_CLASS_CONFIG); - + int latencyPercentileMaxMs = config.getInt(ProduceServiceConfig.LATENCY_PERCENTILE_MAX_MS_CONFIG); + int latencyPercentileGranularityMs = config.getInt(ProduceServiceConfig.LATENCY_PERCENTILE_GRANULARITY_MS_CONFIG); _partitioner = config.getConfiguredInstance(ProduceServiceConfig.PARTITIONER_CLASS_CONFIG, KMPartitioner.class); _threadsNum = config.getInt(ProduceServiceConfig.PRODUCE_THREAD_NUM_CONFIG); _topic = config.getString(ProduceServiceConfig.TOPIC_CONFIG); @@ -89,19 +92,21 @@ public ProduceService(Map props, String name) throws Exception { _produceDelayMs = config.getInt(ProduceServiceConfig.PRODUCE_RECORD_DELAY_MS_CONFIG); _recordSize = config.getInt(ProduceServiceConfig.PRODUCE_RECORD_SIZE_BYTE_CONFIG); _sync = config.getBoolean(ProduceServiceConfig.PRODUCE_SYNC_CONFIG); + boolean treatZeroThroughputAsUnavailable = + config.getBoolean(ProduceServiceConfig.PRODUCER_TREAT_ZERO_THROUGHPUT_AS_UNAVAILABLE_CONFIG); _partitionNum = new AtomicInteger(0); _running = new AtomicBoolean(false); _nextIndexPerPartition = new ConcurrentHashMap<>(); _producerPropsOverride = props.containsKey(ProduceServiceConfig.PRODUCER_PROPS_CONFIG) ? 
(Map) props.get(ProduceServiceConfig.PRODUCER_PROPS_CONFIG) : new HashMap<>(); - for (String property: NONOVERRIDABLE_PROPERTIES) { + for (String property: NON_OVERRIDABLE_PROPERTIES) { if (_producerPropsOverride.containsKey(property)) { throw new ConfigException("Override must not contain " + property + " config."); } } - _partitionNum.set(Utils.getPartitionNumForTopic(_zkConnect, _topic)); + _adminClient = AdminClient.create(props); if (producerClass.equals(NewProducer.class.getCanonicalName()) || producerClass.equals(NewProducer.class.getSimpleName())) { _producerClassName = NewProducer.class.getCanonicalName(); @@ -109,7 +114,7 @@ public ProduceService(Map props, String name) throws Exception { _producerClassName = producerClass; } - initializeProducer(); + initializeProducer(props); _produceExecutor = Executors.newScheduledThreadPool(_threadsNum, new ProduceServiceThreadFactory()); _handleNewPartitionsExecutor = Executors.newSingleThreadScheduledExecutor(new HandleNewPartitionsThreadFactory()); @@ -120,27 +125,31 @@ public ProduceService(Map props, String name) throws Exception { Metrics metrics = new Metrics(metricConfig, reporters, new SystemTime()); Map tags = new HashMap<>(); tags.put("name", _name); - _sensors = new ProduceMetrics(metrics, tags); + _sensors = + new ProduceMetrics(metrics, tags, latencyPercentileGranularityMs, latencyPercentileMaxMs, _partitionNum, + treatZeroThroughputAsUnavailable); } - - private void initializeProducer() throws Exception { - + private void initializeProducer(Map props) throws Exception { Properties producerProps = new Properties(); // Assign default config. This has the lowest priority. producerProps.put(ProducerConfig.ACKS_CONFIG, "-1"); producerProps.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, "20000"); - producerProps.put(ProducerConfig.RETRIES_CONFIG, 3); - producerProps.put(ProducerConfig.BLOCK_ON_BUFFER_FULL_CONFIG, "true"); + producerProps.put(ProducerConfig.RETRIES_CONFIG, "3"); + producerProps.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, Long.MAX_VALUE); producerProps.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "1"); - producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); - producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, KEY_SERIALIZER_CLASS); + producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KEY_SERIALIZER_CLASS); // Assign config specified for ProduceService. producerProps.put(ProducerConfig.CLIENT_ID_CONFIG, _producerId); producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, _brokerList); // Assign config specified for producer. This has the highest priority. 
producerProps.putAll(_producerPropsOverride); + if (props.containsKey(ProduceServiceConfig.PRODUCER_PROPS_CONFIG)) { + props.forEach(producerProps::putIfAbsent); + } + _producer = (KMBaseProducer) Class.forName(_producerClassName).getConstructor(Properties.class).newInstance(producerProps); LOG.info("{}/ProduceService is initialized.", _name); } @@ -148,28 +157,29 @@ private void initializeProducer() throws Exception { @Override public synchronized void start() { if (_running.compareAndSet(false, true)) { - initializeStateForPartitions(); - _handleNewPartitionsExecutor.scheduleWithFixedDelay(new NewPartitionHandler(), 1000, 30000, TimeUnit.MILLISECONDS); + TopicDescription topicDescription = getTopicDescription(_adminClient, _topic); + int partitionNum = topicDescription.partitions().size(); + initializeStateForPartitions(partitionNum); + _handleNewPartitionsExecutor.scheduleWithFixedDelay(new NewPartitionHandler(), 1, 30, TimeUnit.SECONDS); LOG.info("{}/ProduceService started", _name); } } - private void initializeStateForPartitions() { - Map keyMapping = generateKeyMappings(); - int partitionNum = _partitionNum.get(); + private void initializeStateForPartitions(int partitionNum) { + Map keyMapping = generateKeyMappings(partitionNum); for (int partition = 0; partition < partitionNum; partition++) { String key = keyMapping.get(partition); - //This is what preserves sequence numbers across restarts + /* This is what preserves sequence numbers across restarts */ if (!_nextIndexPerPartition.containsKey(partition)) { _nextIndexPerPartition.put(partition, new AtomicLong(0)); _sensors.addPartitionSensors(partition); } _produceExecutor.scheduleWithFixedDelay(new ProduceRunnable(partition, key), _produceDelayMs, _produceDelayMs, TimeUnit.MILLISECONDS); } + _partitionNum.set(partitionNum); } - private Map generateKeyMappings() { - int partitionNum = _partitionNum.get(); + private Map generateKeyMappings(int partitionNum) { HashMap keyMapping = new HashMap<>(); int nextInt = 0; @@ -191,87 +201,27 @@ public synchronized void stop() { _produceExecutor.shutdown(); _handleNewPartitionsExecutor.shutdown(); _producer.close(); - LOG.info("{}/ProduceService stopped", _name); + LOG.info("{}/ProduceService stopped.", _name); } } @Override - public void awaitShutdown() { + public void awaitShutdown(long timeout, TimeUnit unit) { try { _produceExecutor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); _handleNewPartitionsExecutor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { - LOG.info("Thread interrupted when waiting for {}/ProduceService to shutdown", _name); + LOG.info("Thread interrupted when waiting for {}/ProduceService to shutdown.", _name); } - LOG.info("{}/ProduceService shutdown completed", _name); + LOG.info("{}/ProduceService shutdown completed.", _name); } + @Override public boolean isRunning() { return _running.get() && !_handleNewPartitionsExecutor.isShutdown(); } - private class ProduceMetrics { - public final Metrics metrics; - private final Sensor _recordsProduced; - private final Sensor _produceError; - private final ConcurrentMap _recordsProducedPerPartition; - private final ConcurrentMap _produceErrorPerPartition; - private final Map _tags; - - public ProduceMetrics(Metrics metrics, final Map tags) { - this.metrics = metrics; - this._tags = tags; - - _recordsProducedPerPartition = new ConcurrentHashMap<>(); - _produceErrorPerPartition = new ConcurrentHashMap<>(); - - _recordsProduced = metrics.sensor("records-produced"); - 
_recordsProduced.add(new MetricName("records-produced-rate", METRIC_GROUP_NAME, "The average number of records per second that are produced", tags), new Rate()); - _recordsProduced.add(new MetricName("records-produced-total", METRIC_GROUP_NAME, "The total number of records that are produced", tags), new Total()); - - _produceError = metrics.sensor("produce-error"); - _produceError.add(new MetricName("produce-error-rate", METRIC_GROUP_NAME, "The average number of errors per second", tags), new Rate()); - _produceError.add(new MetricName("produce-error-total", METRIC_GROUP_NAME, "The total number of errors", tags), new Total()); - - metrics.addMetric(new MetricName("produce-availability-avg", METRIC_GROUP_NAME, "The average produce availability", tags), - new Measurable() { - @Override - public double measure(MetricConfig config, long now) { - double availabilitySum = 0.0; - int partitionNum = _partitionNum.get(); - for (int partition = 0; partition < partitionNum; partition++) { - double recordsProduced = _sensors.metrics.metrics().get(new MetricName("records-produced-rate-partition-" + partition, METRIC_GROUP_NAME, tags)).value(); - double produceError = _sensors.metrics.metrics().get(new MetricName("produce-error-rate-partition-" + partition, METRIC_GROUP_NAME, tags)).value(); - // If there is no error, error rate sensor may expire and the value may be NaN. Treat NaN as 0 for error rate. - if (Double.isNaN(produceError) || Double.isInfinite(produceError)) { - produceError = 0; - } - // If there is either succeeded or failed produce to a partition, consider its availability as 0. - if (recordsProduced + produceError > 0) { - availabilitySum += recordsProduced / (recordsProduced + produceError); - } - } - // Assign equal weight to per-partition availability when calculating overall availability - return availabilitySum / partitionNum; - } - } - ); - } - - void addPartitionSensors(int partition) { - Sensor recordsProducedSensor = metrics.sensor("records-produced-partition-" + partition); - recordsProducedSensor.add(new MetricName("records-produced-rate-partition-" + partition, METRIC_GROUP_NAME, - "The average number of records per second that are produced to this partition", _tags), new Rate()); - _recordsProducedPerPartition.put(partition, recordsProducedSensor); - - Sensor errorsSensor = metrics.sensor("produce-error-partition-" + partition); - errorsSensor.add(new MetricName("produce-error-rate-partition-" + partition, METRIC_GROUP_NAME, - "The average number of errors per second when producing to this partition", _tags), new Rate()); - _produceErrorPerPartition.put(partition, errorsSensor); - } - } - /** * This creates the records sent to the consumer. 
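Each message produced by ProduceRunnable is a small JSON document built by Utils.jsonFromFields, carrying the sequence index and send timestamp that the consume side uses to measure loss, duplication, and end-to-end latency. A sketch of a comparable payload follows; the field names are assumptions for illustration, not the exact schema:

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

final class MonitorRecordSketch {
  static String payload(String topic, long index, long timeMs, String producerId, int recordSize)
      throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    ObjectNode node = mapper.createObjectNode();
    node.put("topic", topic);         // assumed field: which monitor topic this record belongs to
    node.put("index", index);         // assumed field: per-partition sequence number (detects loss/duplicates)
    node.put("time", timeMs);         // assumed field: send timestamp (consume side derives latency)
    node.put("producerId", producerId);
    // Pad the record up to the configured size so throughput numbers stay comparable.
    int padding = Math.max(0, recordSize - node.toString().length());
    node.put("content", new String(new char[padding]).replace('\0', 'x'));
    return mapper.writeValueAsString(node);
  }
}
```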
*/ @@ -287,11 +237,14 @@ private class ProduceRunnable implements Runnable { public void run() { try { long nextIndex = _nextIndexPerPartition.get(_partition).get(); - String message = Utils.jsonFromFields(_topic, nextIndex, System.currentTimeMillis(), _producerId, _recordSize); + long currMs = System.currentTimeMillis(); + String message = Utils.jsonFromFields(_topic, nextIndex, currMs, _producerId, _recordSize); BaseProducerRecord record = new BaseProducerRecord(_topic, _partition, _key, message); RecordMetadata metadata = _producer.send(record, _sync); + _sensors._produceDelay.record(System.currentTimeMillis() - currMs); _sensors._recordsProduced.record(); _sensors._recordsProducedPerPartition.get(_partition).record(); + _sensors._produceErrorInLastSendPerPartition.put(_partition, false); if (nextIndex == -1 && _sync) { nextIndex = metadata.offset(); } else { @@ -301,6 +254,7 @@ public void run() { } catch (Exception e) { _sensors._produceError.record(); _sensors._produceErrorPerPartition.get(_partition).record(); + _sensors._produceErrorInLastSendPerPartition.put(_partition, true); LOG.warn(_name + " failed to send message", e); } } @@ -313,37 +267,44 @@ public void run() { * sensors are added for the new partitions. */ private class NewPartitionHandler implements Runnable { - public void run() { - int currentPartitionCount = Utils.getPartitionNumForTopic(_zkConnect, _topic); - if (currentPartitionCount <= 0) { - LOG.info("{}/ProduceService topic {} does not exist.", _name, _topic); - return; - } else if (currentPartitionCount == _partitionNum.get()) { - return; - } - LOG.info("{}/ProduceService detected new partitions of topic {}", _name, _topic); - //TODO: Should the ProduceService exit if we can't restart the producer runnables? - _produceExecutor.shutdown(); + LOG.debug("{}/ProduceService check partition number for topic {}.", _name, _topic); try { - _produceExecutor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + int currentPartitionNum = + _adminClient.describeTopics(Collections.singleton(_topic)).all().get().get(_topic).partitions().size(); + if (currentPartitionNum <= 0) { + LOG.info("{}/ProduceService topic {} does not exist.", _name, _topic); + return; + } else if (currentPartitionNum == _partitionNum.get()) { + return; + } + LOG.info("{}/ProduceService detected new partitions of topic {}", _name, _topic); + //TODO: Should the ProduceService exit if we can't restart the producer runnables? 
+ _produceExecutor.shutdown(); + try { + _produceExecutor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + throw new IllegalStateException(e); + } + _producer.close(); + try { + initializeProducer(new HashMap<>()); + } catch (Exception e) { + LOG.error("Failed to restart producer.", e); + throw new IllegalStateException(e); + } + _produceExecutor = Executors.newScheduledThreadPool(_threadsNum); + initializeStateForPartitions(currentPartitionNum); + LOG.info("New partitions added to monitoring."); } catch (InterruptedException e) { - throw new IllegalStateException(e); + LOG.error("InterruptedException occurred.", e); + } catch (ExecutionException e) { + LOG.error("ExecutionException occurred.", e); } - _producer.close(); - _partitionNum.set(currentPartitionCount); - try { - initializeProducer(); - } catch (Exception e) { - LOG.error("Failed to restart producer.", e); - throw new IllegalStateException(e); - } - _produceExecutor = Executors.newScheduledThreadPool(_threadsNum); - initializeStateForPartitions(); - LOG.info("New partitions added to monitoring."); } } + private class ProduceServiceThreadFactory implements ThreadFactory { private final AtomicInteger _threadId = new AtomicInteger(); @@ -358,4 +319,4 @@ public Thread newThread(Runnable r) { } } -} \ No newline at end of file +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ProduceServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/ProduceServiceFactory.java new file mode 100644 index 00000000..9769bac1 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ProduceServiceFactory.java @@ -0,0 +1,34 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory that constructs the ProduceService + */ +@SuppressWarnings("rawtypes") +public class ProduceServiceFactory implements ServiceFactory { + private final Map _props; + private final String _name; + + public ProduceServiceFactory(Map props, String name) { + _props = props; + _name = name; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + return new ProduceService(_props, _name); + } +} diff --git a/src/main/java/com/linkedin/kmf/services/Service.java b/src/main/java/com/linkedin/xinfra/monitor/services/Service.java similarity index 84% rename from src/main/java/com/linkedin/kmf/services/Service.java rename to src/main/java/com/linkedin/xinfra/monitor/services/Service.java index d18318b3..62f85c62 100644 --- a/src/main/java/com/linkedin/kmf/services/Service.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/Service.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. 
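With the ZooKeeper lookup gone, NewPartitionHandler's partition-count probe reduces to a single AdminClient round trip. Distilled (the helper name is illustrative):

```java
import java.util.Collections;
import java.util.concurrent.ExecutionException;
import org.apache.kafka.clients.admin.AdminClient;

final class PartitionCountSketch {
  static int partitionCount(AdminClient adminClient, String topic)
      throws ExecutionException, InterruptedException {
    // describeTopics(...).all() resolves to a Map<String, TopicDescription>.
    return adminClient.describeTopics(Collections.singleton(topic))
        .all().get().get(topic).partitions().size();
  }
}
```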
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,11 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services; + +package com.linkedin.xinfra.monitor.services; + +import java.util.concurrent.TimeUnit; + /** * Services are components of a monitoring application that are expected to be running continuously in order to perform @@ -15,7 +19,7 @@ */ public interface Service { - static final String JMX_PREFIX = "kmf.services"; + String JMX_PREFIX = "kmf.services"; /** * The start logic must only execute once. If an error occurs then the implementer of this class must assume that @@ -43,5 +47,10 @@ public interface Service { /** * Implementations of this method must be thread safe and must be blocking. */ - void awaitShutdown(); + void awaitShutdown(long timeout, TimeUnit unit); + + default String getServiceName() { + return this.getClass().getSimpleName(); + } + } diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/ServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/ServiceFactory.java new file mode 100644 index 00000000..a6761654 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/ServiceFactory.java @@ -0,0 +1,33 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +/** + * Factory that instantiates an instance of Xinfra Monitor Service. + * + * INFORMATION: + * "Class 'ClusterTopicManipulationServiceFactory' is never used" and + * "Constructor 'ClusterTopicManipulationServiceFactory(java.util.Map, java.lang.String)' is never used" + * shown as warnings in IntelliJ IDEA are false positives. + * XinfraMonitor class uses (ServiceFactory) Class.forName(..) + * .getConstructor(...).newInstance(...) to instantiate the factory + * class associated with the given string name + */ +public interface ServiceFactory { + + /** + * This method creates a Xinfra Monitor Service. + * @return a Xinfra Monitor service object + * @throws Exception that occurs while creating an XM Service + */ + Service createService() throws Exception; + +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/SignalFxMetricsReporterService.java b/src/main/java/com/linkedin/xinfra/monitor/services/SignalFxMetricsReporterService.java new file mode 100644 index 00000000..e84f1200 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/SignalFxMetricsReporterService.java @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2018 SignalFx, Inc. Licensed under the Apache 2 License.
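The reflective construction path that the ServiceFactory javadoc describes looks roughly like this from the XinfraMonitor side; a sketch assuming the (Map, String) constructor shape that every factory in this patch exposes:

```java
import java.util.Map;

final class FactoryLoadSketch {
  @SuppressWarnings("rawtypes")
  static Service load(String factoryClassName, Map props, String serviceName) throws Exception {
    // Every *ServiceFactory in this patch exposes a (Map, String) constructor,
    // so the monitor can instantiate it purely from configuration.
    ServiceFactory factory = (ServiceFactory) Class.forName(factoryClassName)
        .getConstructor(Map.class, String.class)
        .newInstance(props, serviceName);
    return factory.createService();
  }
}
```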
+ */ + +package com.linkedin.xinfra.monitor.services; + +import com.codahale.metrics.MetricRegistry; +import com.linkedin.xinfra.monitor.common.MbeanAttributeValue; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.SignalFxMetricsReporterServiceConfig; +import com.signalfx.codahale.metrics.SettableDoubleGauge; +import com.signalfx.codahale.reporter.MetricMetadata; +import com.signalfx.codahale.reporter.SignalFxReporter; +import com.signalfx.endpoint.SignalFxEndpoint; +import java.net.URL; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class SignalFxMetricsReporterService implements Service { + private static final Logger LOG = LoggerFactory.getLogger(SignalFxMetricsReporterService.class); + + private final String _name; + private final List _metricNames; + private final int _reportIntervalSec; + private final ScheduledExecutorService _executor; + private final MetricRegistry _metricRegistry; + private final SignalFxReporter _signalfxReporter; + + private final MetricMetadata _metricMetadata; + private final Map _metricMap; + private Map _dimensionsMap; + + public SignalFxMetricsReporterService(Map props, String name) throws Exception { + SignalFxMetricsReporterServiceConfig config = new SignalFxMetricsReporterServiceConfig(props); + + _name = name; + _metricNames = config.getList(SignalFxMetricsReporterServiceConfig.REPORT_METRICS_CONFIG); + _reportIntervalSec = config.getInt(SignalFxMetricsReporterServiceConfig.REPORT_INTERVAL_SEC_CONFIG); + String signalfxUrl = config.getString(SignalFxMetricsReporterServiceConfig.REPORT_SIGNALFX_URL); + String signalfxToken = config.getString(SignalFxMetricsReporterServiceConfig.SIGNALFX_TOKEN); + + if (StringUtils.isEmpty(signalfxToken)) { + throw new IllegalArgumentException("SignalFx token is not configured"); + } + + _executor = Executors.newSingleThreadScheduledExecutor(); + _metricRegistry = new MetricRegistry(); + _metricMap = new HashMap(); + _dimensionsMap = new HashMap(); + if (props.containsKey(SignalFxMetricsReporterServiceConfig.SIGNALFX_METRIC_DIMENSION)) { + _dimensionsMap = (Map) props.get(SignalFxMetricsReporterServiceConfig.SIGNALFX_METRIC_DIMENSION); + } + + SignalFxReporter.Builder sfxReportBuilder = new SignalFxReporter.Builder( + _metricRegistry, signalfxToken + ); + if (!StringUtils.isEmpty(signalfxUrl)) { + sfxReportBuilder.setEndpoint(getSignalFxEndpoint(signalfxUrl)); + } + _signalfxReporter = sfxReportBuilder.build(); + + _metricMetadata = _signalfxReporter.getMetricMetadata(); + } + + @Override + public synchronized void start() { + _signalfxReporter.start(_reportIntervalSec, TimeUnit.SECONDS); + _executor.scheduleAtFixedRate(() -> { + try { + captureMetrics(); + } catch (Exception e) { + LOG.error(_name + "/SignalFxMetricsReporterService failed to report metrics", e); + } + }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS); + LOG.info("{}/SignalFxMetricsReporterService started", _name); + } + + @Override + public synchronized void stop() { + _executor.shutdown(); + _signalfxReporter.stop(); + LOG.info("{}/SignalFxMetricsReporterService stopped", _name); + } + + @Override + public boolean isRunning() { + return !_executor.isShutdown(); + } + + @Override + public void awaitShutdown(long 
timeout, TimeUnit unit) { + try { + _executor.awaitTermination(5, TimeUnit.MINUTES); + } catch (InterruptedException e) { + LOG.info("Thread interrupted when waiting for {}/SignalFxMetricsReporterService to shutdown", _name); + } + LOG.info("{}/SignalFxMetricsReporterService shutdown completed", _name); + } + + + private SignalFxEndpoint getSignalFxEndpoint(String urlStr) throws Exception { + URL url = new URL(urlStr); + return new SignalFxEndpoint(url.getProtocol(), url.getHost(), url.getPort()); + } + + private String generateSignalFxMetricName(String bean, String attribute) { + String service = bean.split(":")[1]; + String serviceType = service.split(",")[1].split("=")[1]; + return String.format("%s.%s", serviceType, attribute); + } + + private void captureMetrics() { + for (String metricName : _metricNames) { + int index = metricName.lastIndexOf(':'); + String mbeanExpr = metricName.substring(0, index); + String attributeExpr = metricName.substring(index + 1); + + List attributeValues = Utils.getMBeanAttributeValues(mbeanExpr, attributeExpr); + + for (final MbeanAttributeValue attributeValue : attributeValues) { + String metric = attributeValue.toString(); + String key = metric.substring(0, metric.lastIndexOf("=")); + String[] parts = key.split(","); + if (parts.length < 2) { + continue; + } + parts = parts[0].split("="); + if (parts.length < 2 || !parts[1].contains("cluster-monitor")) { + continue; + } + setMetricValue(attributeValue); + } + } + } + + private void setMetricValue(MbeanAttributeValue attributeValue) { + String key = attributeValue.mbean() + attributeValue.attribute(); + SettableDoubleGauge metric = _metricMap.get(key); + if (metric == null) { + metric = createMetric(attributeValue); + _metricMap.put(key, metric); + } + metric.setValue(attributeValue.value()); + } + + private SettableDoubleGauge createMetric(MbeanAttributeValue attributeValue) { + String signalFxMetricName = generateSignalFxMetricName(attributeValue.mbean(), attributeValue.attribute()); + SettableDoubleGauge gauge; + + if (signalFxMetricName.contains("partition")) { + gauge = createPartitionMetric(signalFxMetricName); + } else { + gauge = _metricMetadata.forMetric(new SettableDoubleGauge()) + .withMetricName(signalFxMetricName).metric(); + } + LOG.info("Creating metric : {}", signalFxMetricName); + + for (Map.Entry entry : _dimensionsMap.entrySet()) { + _metricMetadata.forMetric(gauge).withDimension(entry.getKey(), entry.getValue()); + } + _metricMetadata.forMetric(gauge).register(_metricRegistry); + + return gauge; + } + + private SettableDoubleGauge createPartitionMetric(String signalFxMetricName) { + int divider = signalFxMetricName.lastIndexOf('-'); + String partitionNumber = signalFxMetricName.substring(divider + 1); + signalFxMetricName = signalFxMetricName.substring(0, divider); + SettableDoubleGauge gauge = _metricMetadata.forMetric(new SettableDoubleGauge()) + .withMetricName(signalFxMetricName).metric(); + _metricMetadata.forMetric(gauge).withDimension("partition", partitionNumber); + return gauge; + } +} + diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/SignalFxMetricsReporterServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/SignalFxMetricsReporterServiceFactory.java new file mode 100644 index 00000000..9ffc6bbd --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/SignalFxMetricsReporterServiceFactory.java @@ -0,0 +1,37 @@ +/** + * Copyright 2020 LinkedIn Corp. 
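generateSignalFxMetricName above keys the reported name off the second key=value pair in the mbean's property list. Assuming a bean of the form "kmf.services:type=produce-service,name=my-monitor" (the exact bean shape is an assumption here), the parse looks like this in isolation:

```java
final class SignalFxNameSketch {
  // Mirrors generateSignalFxMetricName: take the value of the second
  // key=value pair in the bean and prefix it to the attribute.
  static String metricName(String bean, String attribute) {
    String service = bean.split(":")[1];                      // "type=produce-service,name=my-monitor"
    String secondValue = service.split(",")[1].split("=")[1]; // "my-monitor"
    return String.format("%s.%s", secondValue, attribute);
  }

  public static void main(String[] args) {
    // Prints "my-monitor.records-produced-rate" under the assumed bean shape.
    System.out.println(metricName("kmf.services:type=produce-service,name=my-monitor",
        "records-produced-rate"));
  }
}
```

For per-partition metrics, createMetric above then strips the trailing partition number out of the name and reports it as a "partition" dimension instead.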
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory class which instantiates a SignalFxMetricsReporterService service. + */ +@SuppressWarnings("rawtypes") +public class SignalFxMetricsReporterServiceFactory implements ServiceFactory { + + private final Map _properties; + private final String _serviceName; + + public SignalFxMetricsReporterServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + return new SignalFxMetricsReporterService(_properties, _serviceName); + } +} + diff --git a/src/main/java/com/linkedin/kmf/services/StatsdMetricsReporterService.java b/src/main/java/com/linkedin/xinfra/monitor/services/StatsdMetricsReporterService.java similarity index 71% rename from src/main/java/com/linkedin/kmf/services/StatsdMetricsReporterService.java rename to src/main/java/com/linkedin/xinfra/monitor/services/StatsdMetricsReporterService.java index 9838b350..77ce1307 100644 --- a/src/main/java/com/linkedin/kmf/services/StatsdMetricsReporterService.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/StatsdMetricsReporterService.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,24 +7,23 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ -package com.linkedin.kmf.services; +package com.linkedin.xinfra.monitor.services; -import com.linkedin.kmf.common.MbeanAttributeValue; -import com.linkedin.kmf.services.configs.StatsdMetricsReporterServiceConfig; +import com.linkedin.xinfra.monitor.common.MbeanAttributeValue; +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.services.configs.StatsdMetricsReporterServiceConfig; import com.timgroup.statsd.NonBlockingStatsDClient; import com.timgroup.statsd.StatsDClient; -import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.util.List; import java.util.Map; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import static com.linkedin.kmf.common.Utils.getMBeanAttributeValues; public class StatsdMetricsReporterService implements Service { private static final Logger LOG = LoggerFactory.getLogger(StatsdMetricsReporterService.class); @@ -34,7 +33,6 @@ public class StatsdMetricsReporterService implements Service { private final int _reportIntervalSec; private final ScheduledExecutorService _executor; private final StatsDClient _statsdClient; - private final String _metricNamePrefix; public StatsdMetricsReporterService(Map props, String name) { StatsdMetricsReporterServiceConfig config = new StatsdMetricsReporterServiceConfig(props); @@ -43,25 +41,20 @@ public StatsdMetricsReporterService(Map props, String name) { _metricNames = config.getList(StatsdMetricsReporterServiceConfig.REPORT_METRICS_CONFIG); _reportIntervalSec = config.getInt(StatsdMetricsReporterServiceConfig.REPORT_INTERVAL_SEC_CONFIG); _executor = Executors.newSingleThreadScheduledExecutor(); - _metricNamePrefix = config.getString(StatsdMetricsReporterServiceConfig.REPORT_STATSD_PREFIX); - _statsdClient = new NonBlockingStatsDClient(_metricNamePrefix, + _statsdClient = new NonBlockingStatsDClient(config.getString(StatsdMetricsReporterServiceConfig.REPORT_STATSD_PREFIX), config.getString(StatsdMetricsReporterServiceConfig.REPORT_STATSD_HOST), config.getInt(StatsdMetricsReporterServiceConfig.REPORT_STATSD_PORT)); } @Override public synchronized void start() { - _executor.scheduleAtFixedRate( - new Runnable() { - @Override - public void run() { - try { - reportMetrics(); - } catch (Exception e) { - LOG.error(_name + "/StatsdMetricsReporterService failed to report metrics", e); - } - } - }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS + _executor.scheduleAtFixedRate(() -> { + try { + reportMetrics(); + } catch (Exception e) { + LOG.error(_name + "/StatsdMetricsReporterService failed to report metrics", e); + } + }, _reportIntervalSec, _reportIntervalSec, TimeUnit.SECONDS ); LOG.info("{}/StatsdMetricsReporterService started", _name); } @@ -78,7 +71,7 @@ public boolean isRunning() { } @Override - public void awaitShutdown() { + public void awaitShutdown(long timeout, TimeUnit unit) { try { _executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { @@ -87,21 +80,20 @@ public void awaitShutdown() { LOG.info("{}/StatsdMetricsReporterService shutdown completed", _name); } + private String generateStatsdMetricName(String bean, String attribute) { String service = bean.split(":")[1]; String serviceName = service.split(",")[0].split("=")[1]; String serviceType = service.split(",")[1].split("=")[1]; - 
String[] segs = {_metricNamePrefix, serviceType, serviceName, attribute}; - String metricName = StringUtils.join(segs, "."); - - return _metricNamePrefix.isEmpty() ? metricName.substring(1) : metricName; + String[] segs = {serviceType, serviceName, attribute}; + return StringUtils.join(segs, "."); } private void reportMetrics() { for (String metricName: _metricNames) { String mbeanExpr = metricName.substring(0, metricName.lastIndexOf(":")); String attributeExpr = metricName.substring(metricName.lastIndexOf(":") + 1); - List attributeValues = getMBeanAttributeValues(mbeanExpr, attributeExpr); + List attributeValues = Utils.getMBeanAttributeValues(mbeanExpr, attributeExpr); for (MbeanAttributeValue attributeValue: attributeValues) { final String statsdMetricName = generateStatsdMetricName(attributeValue.mbean(), attributeValue.attribute()); diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/StatsdMetricsReporterServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/StatsdMetricsReporterServiceFactory.java new file mode 100644 index 00000000..046d0a2f --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/StatsdMetricsReporterServiceFactory.java @@ -0,0 +1,37 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory class that constructs the StatsdMetricsReporterService. + */ +@SuppressWarnings("rawtypes") +public class StatsdMetricsReporterServiceFactory implements ServiceFactory { + private final Map _properties; + private final String _name; + + public StatsdMetricsReporterServiceFactory(Map properties, String name) { + + _properties = properties; + _name = name; + } + + @Override + public Service createService() { + + //noinspection unchecked + return new StatsdMetricsReporterService(_properties, _name); + + } +} diff --git a/src/main/java/com/linkedin/kmf/services/TopicManagementService.java b/src/main/java/com/linkedin/xinfra/monitor/services/TopicManagementService.java similarity index 67% rename from src/main/java/com/linkedin/kmf/services/TopicManagementService.java rename to src/main/java/com/linkedin/xinfra/monitor/services/TopicManagementService.java index 26333b2b..491b2514 100644 --- a/src/main/java/com/linkedin/kmf/services/TopicManagementService.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/TopicManagementService.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -8,12 +8,14 @@ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
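The statsd name assembled above differs from the SignalFx one in two ways: it keeps both key values from the bean, and it no longer prepends the configured prefix itself, because NonBlockingStatsDClient is now constructed with that prefix and applies it to every metric it sends. In isolation, under the same assumed bean shape as before:

```java
import org.apache.commons.lang3.StringUtils;

final class StatsdNameSketch {
  // Mirrors generateStatsdMetricName: "<serviceType>.<serviceName>.<attribute>".
  static String metricName(String bean, String attribute) {
    String service = bean.split(":")[1];
    String serviceName = service.split(",")[0].split("=")[1]; // value of the first key=value pair
    String serviceType = service.split(",")[1].split("=")[1]; // value of the second key=value pair
    return StringUtils.join(new String[] {serviceType, serviceName, attribute}, ".");
  }
}
```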
*/ -package com.linkedin.kmf.services; +package com.linkedin.xinfra.monitor.services; -import com.linkedin.kmf.services.configs.MultiClusterTopicManagementServiceConfig; -import com.linkedin.kmf.services.configs.TopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.MultiClusterTopicManagementServiceConfig; +import com.linkedin.xinfra.monitor.services.configs.TopicManagementServiceConfig; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; /** @@ -28,6 +30,10 @@ public TopicManagementService(Map props, String serviceName) thr _multiClusterTopicManagementService = new MultiClusterTopicManagementService(serviceProps, serviceName); } + public CompletableFuture topicPartitionResult() { + return _multiClusterTopicManagementService.topicPartitionResult(); + } + /** * @param props a map of key/value pair used for configuring TopicManagementService * @param serviceName service name @@ -52,8 +58,14 @@ private Map createMultiClusterTopicManagementServiceProps(Map serviceProps = new HashMap<>(); serviceProps.put(MultiClusterTopicManagementServiceConfig.PROPS_PER_CLUSTER_CONFIG, configPerCluster); serviceProps.put(MultiClusterTopicManagementServiceConfig.TOPIC_CONFIG, props.get(TopicManagementServiceConfig.TOPIC_CONFIG)); - if (props.containsKey(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG)) - serviceProps.put(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG, props.get(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG)); + Object providedRebalanceIntervalMsConfig = props.get(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG); + if (providedRebalanceIntervalMsConfig != null) { + serviceProps.put(MultiClusterTopicManagementServiceConfig.REBALANCE_INTERVAL_MS_CONFIG, providedRebalanceIntervalMsConfig); + } + Object providedPreferredLeaderElectionIntervalMsConfig = props.get(MultiClusterTopicManagementServiceConfig.PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_CONFIG); + if (providedPreferredLeaderElectionIntervalMsConfig != null) { + serviceProps.put(MultiClusterTopicManagementServiceConfig.PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_CONFIG, providedPreferredLeaderElectionIntervalMsConfig); + } return serviceProps; } @@ -62,6 +74,7 @@ public synchronized void start() { _multiClusterTopicManagementService.start(); } + @Override public synchronized void stop() { _multiClusterTopicManagementService.stop(); @@ -73,8 +86,9 @@ public boolean isRunning() { } @Override - public void awaitShutdown() { - _multiClusterTopicManagementService.awaitShutdown(); + public void awaitShutdown(long timeout, TimeUnit unit) { + _multiClusterTopicManagementService.awaitShutdown(timeout, unit); } + } diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/TopicManagementServiceFactory.java b/src/main/java/com/linkedin/xinfra/monitor/services/TopicManagementServiceFactory.java new file mode 100644 index 00000000..b27ea335 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/TopicManagementServiceFactory.java @@ -0,0 +1,37 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
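The new topicPartitionResult() accessor surfaces the multi-cluster service's readiness future, which callers can use to defer dependent services until topic and partition management has converged. A hedged sketch of that sequencing (not code from this patch; it assumes the future completes once the monitor topic is ready):

```java
final class StartupOrderingSketch {
  static void startWhenTopicReady(TopicManagementService topicService, Service dependentService) {
    topicService.start();
    // Defer the dependent service until the monitor topic's partitions are settled.
    topicService.topicPartitionResult().thenRun(dependentService::start);
  }
}
```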
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import java.util.Map; + + +/** + * Factory class which constructs the TopicManagementService. + */ +@SuppressWarnings("rawtypes") +public class TopicManagementServiceFactory implements ServiceFactory { + private final Map _properties; + private final String _serviceName; + + public TopicManagementServiceFactory(Map properties, String serviceName) { + + _properties = properties; + _serviceName = serviceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + + return new TopicManagementService(_properties, _serviceName); + + } +} diff --git a/src/main/java/com/linkedin/kmf/services/configs/CommonServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/CommonServiceConfig.java similarity index 85% rename from src/main/java/com/linkedin/kmf/services/configs/CommonServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/CommonServiceConfig.java index 71c05c23..52d1ba47 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/CommonServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/CommonServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,17 +7,21 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
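The factory indirection exists so the monitor can build services reflectively from configured class names; a hedged sketch of that lookup (the reflective wiring shown here is an assumption for illustration, not code from this patch):

    String factoryClassName = "com.linkedin.xinfra.monitor.services.TopicManagementServiceFactory";
    ServiceFactory factory = (ServiceFactory) Class.forName(factoryClassName)
        .getConstructor(Map.class, String.class)   // matches the (Map, String) constructor above
        .newInstance(props, "topic-management-service");
    Service service = factory.createService();
    service.start();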
*/ -package com.linkedin.kmf.services.configs; + +package com.linkedin.xinfra.monitor.services.configs; import org.apache.kafka.clients.CommonClientConfigs; public class CommonServiceConfig { + public static final String CONSUMER_PROPS_CONFIG = "consumer.props"; + public static final String CONSUMER_PROPS_DOC = "consumer props"; + public static final String ZOOKEEPER_CONNECT_CONFIG = "zookeeper.connect"; public static final String ZOOKEEPER_CONNECT_DOC = "Zookeeper connect string."; public static final String BOOTSTRAP_SERVERS_CONFIG = CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; - public static final String BOOTSTRAP_SERVERS_DOC = CommonClientConfigs.BOOSTRAP_SERVERS_DOC; + public static final String BOOTSTRAP_SERVERS_DOC = CommonClientConfigs.BOOTSTRAP_SERVERS_DOC; public static final String TOPIC_CONFIG = "topic"; public static final String TOPIC_DOC = "Topic to be used by the service."; @@ -31,4 +35,4 @@ public class CommonServiceConfig { public static final String REPORT_INTERVAL_SEC_CONFIG = "report.interval.sec"; public static final String REPORT_INTERVAL_SEC_DOC = "The interval in second by which metrics reporter service will report the metrics values."; -} \ No newline at end of file +} diff --git a/src/main/java/com/linkedin/kmf/services/configs/ConsumeServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/ConsumeServiceConfig.java similarity index 94% rename from src/main/java/com/linkedin/kmf/services/configs/ConsumeServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/ConsumeServiceConfig.java index dd58f6f4..a5764fa0 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/ConsumeServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/ConsumeServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,13 +7,14 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services.configs; +package com.linkedin.xinfra.monitor.services.configs; -import com.linkedin.kmf.consumer.NewConsumer; +import com.linkedin.xinfra.monitor.consumer.NewConsumer; import java.util.Map; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; + public class ConsumeServiceConfig extends AbstractConfig { private static final ConfigDef CONFIG; @@ -29,7 +30,7 @@ public class ConsumeServiceConfig extends AbstractConfig { public static final String CONSUMER_CLASS_CONFIG = "consume.consumer.class"; public static final String CONSUMER_CLASS_DOC = "Consumer class that will be instantiated as consumer in the consume service. 
" - + "It can be NewConsumer, OldConsumer, or full class name of any class that implements the KMBaseConsumer interface."; + + "It can be NewConsumer or full class name of any class that implements the KMBaseConsumer interface."; public static final String LATENCY_PERCENTILE_MAX_MS_CONFIG = "consume.latency.percentile.max.ms"; public static final String LATENCY_PERCENTILE_MAX_MS_DOC = "This is used to derive the bucket number used to configure latency percentile metric. " @@ -79,7 +80,6 @@ public class ConsumeServiceConfig extends AbstractConfig { 20000, ConfigDef.Importance.MEDIUM, LATENCY_SLA_MS_DOC); - } public ConsumeServiceConfig(Map props) { diff --git a/src/main/java/com/linkedin/kmf/services/configs/DefaultMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/DefaultMetricsReporterServiceConfig.java similarity index 86% rename from src/main/java/com/linkedin/kmf/services/configs/DefaultMetricsReporterServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/DefaultMetricsReporterServiceConfig.java index 0c8c9130..575a553e 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/DefaultMetricsReporterServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/DefaultMetricsReporterServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,12 +7,13 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services.configs; +package com.linkedin.xinfra.monitor.services.configs; + +import java.util.Collections; +import java.util.Map; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; -import java.util.Arrays; -import java.util.Map; public class DefaultMetricsReporterServiceConfig extends AbstractConfig { @@ -26,8 +27,7 @@ public class DefaultMetricsReporterServiceConfig extends AbstractConfig { static { CONFIG = new ConfigDef().define(REPORT_METRICS_CONFIG, - ConfigDef.Type.LIST, - Arrays.asList("kmf.services:*:*"), + ConfigDef.Type.LIST, Collections.singletonList("kmf.services:*:*"), ConfigDef.Importance.MEDIUM, REPORT_METRICS_DOC) .define(REPORT_INTERVAL_SEC_CONFIG, diff --git a/src/main/java/com/linkedin/kmf/services/configs/GraphiteMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/GraphiteMetricsReporterServiceConfig.java similarity index 92% rename from src/main/java/com/linkedin/kmf/services/configs/GraphiteMetricsReporterServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/GraphiteMetricsReporterServiceConfig.java index 827cb7e4..29ea04ed 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/GraphiteMetricsReporterServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/GraphiteMetricsReporterServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,13 +7,13 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services.configs; -import org.apache.kafka.common.config.AbstractConfig; -import org.apache.kafka.common.config.ConfigDef; +package com.linkedin.xinfra.monitor.services.configs; -import java.util.Arrays; +import java.util.Collections; import java.util.Map; +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; public class GraphiteMetricsReporterServiceConfig extends AbstractConfig { private static final ConfigDef CONFIG; @@ -35,8 +35,7 @@ public class GraphiteMetricsReporterServiceConfig extends AbstractConfig { static { CONFIG = new ConfigDef().define(REPORT_METRICS_CONFIG, - ConfigDef.Type.LIST, - Arrays.asList("kmf.services:*:*"), + ConfigDef.Type.LIST, Collections.singletonList("kmf.services:*:*"), ConfigDef.Importance.MEDIUM, REPORT_METRICS_DOC) .define(REPORT_INTERVAL_SEC_CONFIG, diff --git a/src/main/java/com/linkedin/kmf/services/configs/JettyServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/JettyServiceConfig.java similarity index 90% rename from src/main/java/com/linkedin/kmf/services/configs/JettyServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/JettyServiceConfig.java index 38676631..06fd6e25 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/JettyServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/JettyServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services.configs; + +package com.linkedin.xinfra.monitor.services.configs; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/configs/KafkaMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/KafkaMetricsReporterServiceConfig.java new file mode 100644 index 00000000..d6c30ac4 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/KafkaMetricsReporterServiceConfig.java @@ -0,0 +1,76 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.configs; + +import java.util.Collections; +import java.util.Map; +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +import static org.apache.kafka.common.config.ConfigDef.Range.atLeast; + +public class KafkaMetricsReporterServiceConfig extends AbstractConfig { + + private static final ConfigDef CONFIG; + + public static final String REPORT_METRICS_CONFIG = CommonServiceConfig.REPORT_METRICS_CONFIG; + public static final String REPORT_METRICS_DOC = CommonServiceConfig.REPORT_METRICS_DOC; + + public static final String REPORT_INTERVAL_SEC_CONFIG = CommonServiceConfig.REPORT_INTERVAL_SEC_CONFIG; + public static final String REPORT_INTERVAL_SEC_DOC = CommonServiceConfig.REPORT_INTERVAL_SEC_DOC; + + public static final String ZOOKEEPER_CONNECT_CONFIG = CommonServiceConfig.ZOOKEEPER_CONNECT_CONFIG; + public static final String ZOOKEEPER_CONNECT_DOC = CommonServiceConfig.ZOOKEEPER_CONNECT_DOC; + + public static final String BOOTSTRAP_SERVERS_CONFIG = CommonServiceConfig.BOOTSTRAP_SERVERS_CONFIG; + public static final String BOOTSTRAP_SERVERS_DOC = CommonServiceConfig.BOOTSTRAP_SERVERS_DOC; + + public static final String TOPIC_CONFIG = CommonServiceConfig.TOPIC_CONFIG; + public static final String TOPIC_DOC = CommonServiceConfig.TOPIC_DOC; + + public static final String TOPIC_REPLICATION_FACTOR = "report.kafka.topic.replication.factor"; + public static final String TOPIC_REPLICATION_FACTOR_DOC = "This replication factor is used to create the metrics reporter topic."; + + + static { + CONFIG = new ConfigDef().define(REPORT_METRICS_CONFIG, + ConfigDef.Type.LIST, Collections.singletonList("kmf.services:*:*"), + ConfigDef.Importance.MEDIUM, + REPORT_METRICS_DOC) + .define(REPORT_INTERVAL_SEC_CONFIG, + ConfigDef.Type.INT, + 1, + ConfigDef.Importance.LOW, + REPORT_INTERVAL_SEC_DOC) + .define(ZOOKEEPER_CONNECT_CONFIG, + ConfigDef.Type.STRING, + ConfigDef.Importance.HIGH, + ZOOKEEPER_CONNECT_DOC) + .define(BOOTSTRAP_SERVERS_CONFIG, + ConfigDef.Type.STRING, + ConfigDef.Importance.HIGH, + BOOTSTRAP_SERVERS_DOC) + .define(TOPIC_CONFIG, + ConfigDef.Type.STRING, + ConfigDef.Importance.HIGH, + TOPIC_DOC) + .define(TOPIC_REPLICATION_FACTOR, + ConfigDef.Type.INT, + 1, + atLeast(1), + ConfigDef.Importance.LOW, + TOPIC_REPLICATION_FACTOR_DOC); + } + + public KafkaMetricsReporterServiceConfig(Map props) { + super(CONFIG, props); + } +} diff --git a/src/main/java/com/linkedin/kmf/services/configs/MultiClusterTopicManagementServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/MultiClusterTopicManagementServiceConfig.java similarity index 71% rename from src/main/java/com/linkedin/kmf/services/configs/MultiClusterTopicManagementServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/MultiClusterTopicManagementServiceConfig.java index 5ac0f61a..73138c00 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/MultiClusterTopicManagementServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/MultiClusterTopicManagementServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,7 +7,8 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.services.configs; + +package com.linkedin.xinfra.monitor.services.configs; import java.util.Map; import org.apache.kafka.common.config.AbstractConfig; @@ -29,6 +30,10 @@ public class MultiClusterTopicManagementServiceConfig extends AbstractConfig { public static final String REBALANCE_INTERVAL_MS_DOC = "The gap in ms between the times the cluster balance on the " + "monitor topic is checked. Set this to a large value to disable automatic topic rebalance."; + public static final String PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_CONFIG = "topic-management.preferred.leader.election.check.interval.ms"; + public static final String PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_DOC = "The gap in ms between the times to check if preferred leader election" + + " can be performed when requested during rebalance"; + static { CONFIG = new ConfigDef() .define(TOPIC_CONFIG, @@ -40,7 +45,12 @@ public class MultiClusterTopicManagementServiceConfig extends AbstractConfig { 1000 * 60 * 10, atLeast(10), ConfigDef.Importance.LOW, - REBALANCE_INTERVAL_MS_DOC); + REBALANCE_INTERVAL_MS_DOC) + .define(PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_CONFIG, + ConfigDef.Type.LONG, + 1000 * 60 * 5, + atLeast(5), + ConfigDef.Importance.LOW, PREFERRED_LEADER_ELECTION_CHECK_INTERVAL_MS_DOC); } public MultiClusterTopicManagementServiceConfig(Map props) { diff --git a/src/main/java/com/linkedin/kmf/services/configs/ProduceServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/ProduceServiceConfig.java similarity index 68% rename from src/main/java/com/linkedin/kmf/services/configs/ProduceServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/ProduceServiceConfig.java index 0a613971..1c434193 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/ProduceServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/ProduceServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,10 +7,11 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
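Both intervals can be tuned per deployment; the defaults defined below are ten minutes for the rebalance check and five minutes for the preferred leader election check. A hypothetical xinfra-monitor.properties fragment (the service name and values are examples only):

    "topic-management-service": {
      "class.name": "com.linkedin.xinfra.monitor.services.TopicManagementService",
      "topic": "xinfra-monitor-topic",
      "topic-management.preferred.leader.election.check.interval.ms": 300000
    }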
*/ -package com.linkedin.kmf.services.configs; -import com.linkedin.kmf.partitioner.NewKMPartitioner; -import com.linkedin.kmf.producer.NewProducer; +package com.linkedin.xinfra.monitor.services.configs; + +import com.linkedin.xinfra.monitor.partitioner.NewKMPartitioner; +import com.linkedin.xinfra.monitor.producer.NewProducer; import java.util.Map; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; @@ -55,6 +56,20 @@ public class ProduceServiceConfig extends AbstractConfig { public static final String PRODUCER_PROPS_CONFIG = "produce.producer.props"; public static final String PRODUCER_PROPS_DOC = "The properties used to config producer in produce service."; + public static final String LATENCY_PERCENTILE_MAX_MS_CONFIG = "produce.latency.percentile.max.ms"; + public static final String LATENCY_PERCENTILE_MAX_MS_DOC = "This is used to derive the bucket number used to configure latency percentile metric. " + + "Any latency larger than this max value will be rounded down to the max value."; + + public static final String LATENCY_PERCENTILE_GRANULARITY_MS_CONFIG = "produce.latency.percentile.granularity.ms"; + public static final String LATENCY_PERCENTILE_GRANULARITY_MS_DOC = "This is used to derive the bucket number used to configure latency percentile metric. " + + "The latency at the specified percentile should be multiple of this value."; + + public static final String PRODUCER_TREAT_ZERO_THROUGHPUT_AS_UNAVAILABLE_CONFIG = "produce.treat.zero.throughput.as.unavailable"; + public static final String PRODUCER_TREAT_ZERO_THROUGHPUT_AS_UNAVAILABLE_DOC = "If it is set to true, produce availability is set to 0 " + + "if no message can be produced, regardless of whether there is exception. If this is set to false, availability will only drop below 1 if there is exception " + + "thrown from producer. Depending on the producer configuration, it may take a few minutes for producer to be blocked before it throws exception. Advanced user " + + "may want to set this flag to false to exactly measure the availability experienced by users"; + static { CONFIG = new ConfigDef().define(ZOOKEEPER_CONNECT_CONFIG, ConfigDef.Type.STRING, @@ -98,6 +113,21 @@ public class ProduceServiceConfig extends AbstractConfig { 100, ConfigDef.Importance.LOW, PRODUCE_RECORD_SIZE_BYTE_DOC) + .define(PRODUCER_TREAT_ZERO_THROUGHPUT_AS_UNAVAILABLE_CONFIG, + ConfigDef.Type.BOOLEAN, + true, + ConfigDef.Importance.MEDIUM, + PRODUCER_TREAT_ZERO_THROUGHPUT_AS_UNAVAILABLE_DOC) + .define(LATENCY_PERCENTILE_MAX_MS_CONFIG, + ConfigDef.Type.INT, + 5000, + ConfigDef.Importance.LOW, + LATENCY_PERCENTILE_MAX_MS_DOC) + .define(LATENCY_PERCENTILE_GRANULARITY_MS_CONFIG, + ConfigDef.Type.INT, + 1, + ConfigDef.Importance.LOW, + LATENCY_PERCENTILE_GRANULARITY_MS_DOC) .define(PRODUCE_THREAD_NUM_CONFIG, ConfigDef.Type.INT, 5, diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/configs/SignalFxMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/SignalFxMetricsReporterServiceConfig.java new file mode 100644 index 00000000..5a8e3e9a --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/SignalFxMetricsReporterServiceConfig.java @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2018 SignalFx, Inc. Licensed under the Apache 2 License. 
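The three new produce-side settings default to 5000 ms, 1 ms, and true respectively; a hypothetical fragment overriding them (the service and class names follow the pattern of the other config examples and are illustrative):

    "produce-service": {
      "class.name": "com.linkedin.xinfra.monitor.services.ProduceService",
      "produce.latency.percentile.max.ms": 5000,
      "produce.latency.percentile.granularity.ms": 1,
      "produce.treat.zero.throughput.as.unavailable": false
    }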
+ */ + +package com.linkedin.xinfra.monitor.services.configs; + +import java.util.Collections; +import java.util.Map; +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +/** + * key/value pair used for configuring SignalFxMetricsReporterService + * + */ +public class SignalFxMetricsReporterServiceConfig extends AbstractConfig { + private static final ConfigDef CONFIG; + + public static final String REPORT_METRICS_CONFIG = "report.metrics.list"; + public static final String REPORT_METRICS_DOC = CommonServiceConfig.REPORT_METRICS_DOC; + + public static final String REPORT_INTERVAL_SEC_CONFIG = CommonServiceConfig.REPORT_INTERVAL_SEC_CONFIG; + public static final String REPORT_INTERVAL_SEC_DOC = CommonServiceConfig.REPORT_INTERVAL_SEC_DOC; + + public static final String REPORT_SIGNALFX_URL = "report.signalfx.url"; + public static final String REPORT_SIGNALFX_URL_DOC = "The url of signalfx server which SignalFxMetricsReporterService will report the metrics values."; + + public static final String SIGNALFX_METRIC_DIMENSION = "report.metric.dimensions"; + public static final String SIGNALFX_METRIC_DIMENSION_DOC = "Dimensions added to each metric. Example: {\"key1:value1\", \"key2:value2\"} "; + + public static final String SIGNALFX_TOKEN = "report.signalfx.token"; + public static final String SIGNALFX_TOKEN_DOC = "SignalFx access token"; + + static { + CONFIG = new ConfigDef().define(REPORT_METRICS_CONFIG, + ConfigDef.Type.LIST, Collections.singletonList("kmf.services:*:*"), + ConfigDef.Importance.MEDIUM, + REPORT_METRICS_DOC) + .define(REPORT_INTERVAL_SEC_CONFIG, + ConfigDef.Type.INT, + 1, + ConfigDef.Importance.LOW, + REPORT_INTERVAL_SEC_DOC) + .define(REPORT_SIGNALFX_URL, + ConfigDef.Type.STRING, + "", + ConfigDef.Importance.LOW, + REPORT_SIGNALFX_URL_DOC) + .define(SIGNALFX_TOKEN, + ConfigDef.Type.STRING, + "", + ConfigDef.Importance.HIGH, + SIGNALFX_TOKEN_DOC); + } + + public SignalFxMetricsReporterServiceConfig(Map props) { + super(CONFIG, props); + } +} + diff --git a/src/main/java/com/linkedin/kmf/services/configs/StatsdMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java similarity index 67% rename from src/main/java/com/linkedin/kmf/services/configs/StatsdMetricsReporterServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java index a0b98efc..8438fd78 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/StatsdMetricsReporterServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -8,37 +8,16 @@ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
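A hypothetical SignalFx reporter configuration exercising these keys (the ingest URL and token are placeholders, and the service name is illustrative):

    "signalfx-service": {
      "class.name": "com.linkedin.xinfra.monitor.services.SignalFxMetricsReporterService",
      "report.interval.sec": 1,
      "report.signalfx.url": "https://ingest.signalfx.example",
      "report.signalfx.token": "<signalfx-access-token>",
      "report.metric.dimensions": {"datacenter": "dc-1"}
    }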
* * - * In order to enable the StatsD metrics export, add the following section to kafka-monitor.properties file + * In order to enable the StatsD metrics export, add the following section to xinfra-monitor.properties file * - ========================================================================================== - "statsd-service": { - "class.name": "com.linkedin.kmf.services.StatsdMetricsReporterService", - "report.statsd.host": "localhost", - "report.statsd.port": "8125", - "report.statsd.prefix": "kafka-monitor", - "report.interval.sec": 1, - "report.metrics.list": [ - "kmf.services:type=produce-service,name=*:produce-availability-avg", - "kmf.services:type=consume-service,name=*:consume-availability-avg", - "kmf.services:type=produce-service,name=*:records-produced-total", - "kmf.services:type=consume-service,name=*:records-consumed-total", - "kmf.services:type=consume-service,name=*:records-lost-total", - "kmf.services:type=consume-service,name=*:records-duplicated-total", - "kmf.services:type=consume-service,name=*:records-delay-ms-avg", - "kmf.services:type=produce-service,name=*:records-produced-rate", - "kmf.services:type=produce-service,name=*:produce-error-rate", - "kmf.services:type=consume-service,name=*:consume-error-rate" - ] - } - ========================================================================================== */ -package com.linkedin.kmf.services.configs; -import org.apache.kafka.common.config.AbstractConfig; -import org.apache.kafka.common.config.ConfigDef; +package com.linkedin.xinfra.monitor.services.configs; -import java.util.Arrays; +import java.util.Collections; import java.util.Map; +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; public class StatsdMetricsReporterServiceConfig extends AbstractConfig { private static final ConfigDef CONFIG; @@ -60,8 +39,7 @@ public class StatsdMetricsReporterServiceConfig extends AbstractConfig { static { CONFIG = new ConfigDef().define(REPORT_METRICS_CONFIG, - ConfigDef.Type.LIST, - Arrays.asList("kmf.services:*:*"), + ConfigDef.Type.LIST, Collections.singletonList("kmf.services:*:*"), ConfigDef.Importance.MEDIUM, REPORT_METRICS_DOC) .define(REPORT_INTERVAL_SEC_CONFIG, diff --git a/src/main/java/com/linkedin/kmf/services/configs/TopicManagementServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/TopicManagementServiceConfig.java similarity index 68% rename from src/main/java/com/linkedin/kmf/services/configs/TopicManagementServiceConfig.java rename to src/main/java/com/linkedin/xinfra/monitor/services/configs/TopicManagementServiceConfig.java index 36a1b947..8518ccf4 100644 --- a/src/main/java/com/linkedin/kmf/services/configs/TopicManagementServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/TopicManagementServiceConfig.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,10 +7,11 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
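The inline example removed from the javadoc above remains the clearest way to see the expected shape; an updated equivalent for xinfra-monitor.properties with the renamed package (host, port, and prefix values are illustrative):

    "statsd-service": {
      "class.name": "com.linkedin.xinfra.monitor.services.StatsdMetricsReporterService",
      "report.statsd.host": "localhost",
      "report.statsd.port": "8125",
      "report.statsd.prefix": "xinfra-monitor",
      "report.interval.sec": 1,
      "report.metrics.list": [
        "kmf.services:type=produce-service,name=*:produce-availability-avg",
        "kmf.services:type=consume-service,name=*:consume-availability-avg",
        "kmf.services:type=consume-service,name=*:records-delay-ms-avg"
      ]
    }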
*/ -package com.linkedin.kmf.services.configs; -import com.linkedin.kmf.topicfactory.DefaultTopicFactory; -import com.linkedin.kmf.topicfactory.TopicFactory; +package com.linkedin.xinfra.monitor.services.configs; + +import com.linkedin.xinfra.monitor.topicfactory.DefaultTopicFactory; +import com.linkedin.xinfra.monitor.topicfactory.TopicFactory; import java.util.Map; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; @@ -40,14 +41,24 @@ public class TopicManagementServiceConfig extends AbstractConfig { + " This config provides a loose lower bound on the partition number of the monitor topic when the topic is created or when partition is added."; public static final String TOPIC_REPLICATION_FACTOR_CONFIG = "topic-management.replicationFactor"; - public static final String TOPIC_REPLICATION_FACTOR_DOC = "When a topic is created automatically this is the " - + "replication factor used."; + public static final String TOPIC_REPLICATION_FACTOR_DOC = "This replication factor is used to create the monitor topic. " + + "The larger one of the current replication factor and the configured replication factor is used to expand partition " + + "of the monitor topic."; public static final String TOPIC_CREATION_ENABLED_CONFIG = "topic-management.topicCreationEnabled"; public static final String TOPIC_CREATION_ENABLED_DOC = String.format("When true this service automatically creates the topic named" + " in the config with replication factor %s and min ISR as max(%s - 1, 1). The partition number is determined based on %s and %s", TOPIC_REPLICATION_FACTOR_CONFIG, TOPIC_REPLICATION_FACTOR_CONFIG, PARTITIONS_TO_BROKERS_RATIO_CONFIG, MIN_PARTITION_NUM_DOC); + public static final String TOPIC_ADD_PARTITION_ENABLED_CONFIG = "topic-management.topicAddPartitionEnabled"; + public static final String TOPIC_ADD_PARTITION_ENABLED_DOC = String.format("When true this service automatically add topic partition(s) " + + "if the current topic partition count is smaller than the partition number which is determined based on %s and %s", + PARTITIONS_TO_BROKERS_RATIO_CONFIG, MIN_PARTITION_NUM_DOC); + + public static final String TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_CONFIG = "topic-management.topicReassignPartitionAndElectLeaderEnabled"; + public static final String TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_DOC = "When true this service automatically balance topic partitions in" + + " a cluster to ensure a minimum number of leader replicas on each alive broker."; + public static final String TOPIC_FACTORY_CLASS_CONFIG = "topic-management.topicFactory.class.name"; public static final String TOPIC_FACTORY_CLASS_DOC = "The name of the class used to create topics. 
This class must implement " + TopicFactory.class.getName() + "."; @@ -58,8 +69,16 @@ public class TopicManagementServiceConfig extends AbstractConfig { public static final String TOPIC_PROPS_CONFIG = "topic-management.topic.props"; public static final String TOPIC_PROPS_DOC = "A configuration map for the topic"; + public static final String TOPIC_MANAGEMENT_ENABLED_CONFIG = "topic-management.topicManagementEnabled"; + public static final String TOPIC_MANAGEMENT_ENABLED_DOC = "Boolean switch for enabling Topic Management Service"; + static { CONFIG = new ConfigDef() + .define(TOPIC_MANAGEMENT_ENABLED_CONFIG, + ConfigDef.Type.BOOLEAN, + true, + ConfigDef.Importance.HIGH, + TOPIC_MANAGEMENT_ENABLED_DOC) .define(ZOOKEEPER_CONNECT_CONFIG, ConfigDef.Type.STRING, ConfigDef.Importance.HIGH, @@ -83,6 +102,16 @@ public class TopicManagementServiceConfig extends AbstractConfig { true, ConfigDef.Importance.LOW, TOPIC_CREATION_ENABLED_DOC) + .define(TOPIC_ADD_PARTITION_ENABLED_CONFIG, + ConfigDef.Type.BOOLEAN, + true, + ConfigDef.Importance.LOW, + TOPIC_ADD_PARTITION_ENABLED_DOC) + .define(TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_CONFIG, + ConfigDef.Type.BOOLEAN, + true, + ConfigDef.Importance.LOW, + TOPIC_REASSIGN_PARTITION_AND_ELECT_LEADER_ENABLED_DOC) .define(TOPIC_REPLICATION_FACTOR_CONFIG, ConfigDef.Type.INT, 1, diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ClusterTopicManipulationMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ClusterTopicManipulationMetrics.java new file mode 100644 index 00000000..947177e9 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ClusterTopicManipulationMetrics.java @@ -0,0 +1,126 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import java.util.Map; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.Sensor; +import org.apache.kafka.common.metrics.stats.Avg; +import org.apache.kafka.common.metrics.stats.Max; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Metrics sub-class for Cluster Topic Manipulation Service that extends the parent class XinfraMonitorMetrics. + */ +public class ClusterTopicManipulationMetrics extends XinfraMonitorMetrics { + + private static final Logger LOGGER = LoggerFactory.getLogger(ClusterTopicManipulationMetrics.class); + private final Sensor _topicCreationSensor; + private final Sensor _topicDeletionSensor; + private long _topicCreationStartTimeMs; + private long _topicDeletionStartTimeMs; + public static final String METRIC_GROUP_NAME = "cluster-topic-manipulation-service"; + + /** + * + * @param metrics a named, numerical measurement. sensor is a handle to record numerical measurements as they occur. + * @param tags metrics/sensor's tags + */ + public ClusterTopicManipulationMetrics(final Metrics metrics, final Map tags) { + super(metrics, tags); + _topicCreationSensor = metrics.sensor("topic-creation-metadata-propagation"); + _topicDeletionSensor = metrics.sensor("topic-deletion-metadata-propagation"); + _topicCreationSensor.add(new MetricName("topic-creation-metadata-propagation-ms-avg", METRIC_GROUP_NAME, + "The average propagation duration in ms of propagating topic creation data and metadata to all brokers in the cluster", + tags), new Avg()); + _topicCreationSensor.add(new MetricName("topic-creation-metadata-propagation-ms-max", METRIC_GROUP_NAME, + "The maximum propagation time in ms of propagating topic creation data and metadata to all brokers in the cluster", + tags), new Max()); + _topicDeletionSensor.add(new MetricName("topic-deletion-metadata-propagation-ms-avg", METRIC_GROUP_NAME, + "The average propagation duration in milliseconds of propagating the topic deletion data and metadata " + + "across all the brokers in the cluster.", tags), new Avg()); + _topicDeletionSensor.add(new MetricName("topic-deletion-metadata-propagation-ms-max", METRIC_GROUP_NAME, + "The maximum propagation time in milliseconds of propagating the topic deletion data and metadata " + + "across all the brokers in the cluster.", tags), new Max()); + + LOGGER.debug("{} constructor was initialized successfully.", "ClusterTopicManipulationMetrics"); + } + + /** + * start measuring the topic creation process and its RPC (remote programmable client) + */ + public void startTopicCreationMeasurement() { + this.setTopicCreationStartTimeMs(System.currentTimeMillis()); + LOGGER.debug("Started measuring."); + } + + public void startTopicDeletionMeasurement() { + this.setTopicDeletionStartTimeMs(System.currentTimeMillis()); + LOGGER.debug("Started measuring the cluster topic deletion process."); + } + + /** + * + * @param millis time in milliseconds in long data type + */ + void setTopicCreationStartTimeMs(long millis) { + _topicCreationStartTimeMs = millis; + } + + /** + * + * @param millis time in milli-seconds as a long data type + */ + void setTopicDeletionStartTimeMs(long millis) { + _topicDeletionStartTimeMs = millis; + } + + /** 
+ * + */ + public void finishTopicCreationMeasurement() { + long completedMs = System.currentTimeMillis(); + long startMs = this.topicCreationStartTimeMs(); + this._topicCreationSensor.record(completedMs - startMs); + + LOGGER.debug("Finished measuring topic creation."); + } + + public void finishTopicDeletionMeasurement() { + long completeMs = System.currentTimeMillis(); + long startMs = this.topicDeletionStartTimeMs(); + this._topicDeletionSensor.record(completeMs - startMs); + + LOGGER.debug("Finished measuring topic deletion"); + } + + /** + * + * @return the _topicCreationStartTimeMs as a long data type + */ + private long topicCreationStartTimeMs() { + return _topicCreationStartTimeMs; + } + + private long topicDeletionStartTimeMs() { + return _topicDeletionStartTimeMs; + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } +} + + diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/CommitAvailabilityMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/CommitAvailabilityMetrics.java new file mode 100644 index 00000000..9643a8d0 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/CommitAvailabilityMetrics.java @@ -0,0 +1,61 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import java.util.Map; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.metrics.MetricConfig; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.Sensor; +import org.apache.kafka.common.metrics.stats.CumulativeSum; +import org.apache.kafka.common.metrics.stats.Rate; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class CommitAvailabilityMetrics { + + private static final String METRIC_GROUP_NAME = "commit-availability-service"; + private static final Logger LOG = LoggerFactory.getLogger(CommitAvailabilityMetrics.class); + public final Sensor _offsetsCommitted; + public final Sensor _failedCommitOffsets; + + /** + * Metrics for Calculating the offset commit availability of a consumer. 
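Callers are expected to bracket each cluster operation with the start/finish pair so the sensors capture end-to-end propagation time; a minimal usage sketch in which the topic-creation step is a hypothetical helper:

    ClusterTopicManipulationMetrics manipulationMetrics = new ClusterTopicManipulationMetrics(metrics, tags);
    manipulationMetrics.startTopicCreationMeasurement();
    createTopicAndAwaitPropagation();                      // hypothetical: create the topic and wait for metadata on all brokers
    manipulationMetrics.finishTopicCreationMeasurement();  // records the elapsed ms into the avg/max sensors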
+ * @param metrics the commit offset metrics + * @param tags the tags associated, i.e) kmf.services:name=single-cluster-monitor + */ + public CommitAvailabilityMetrics(final Metrics metrics, final Map tags) { + LOG.info("{} called.", this.getClass().getSimpleName()); + _offsetsCommitted = metrics.sensor("offsets-committed"); + _offsetsCommitted.add(new MetricName("offsets-committed-total", METRIC_GROUP_NAME, + "The total number of offsets per second that are committed.", tags), new CumulativeSum()); + + _failedCommitOffsets = metrics.sensor("failed-commit-offsets"); + _failedCommitOffsets.add(new MetricName("failed-commit-offsets-avg", METRIC_GROUP_NAME, + "The average number of offsets per second that have failed.", tags), new Rate()); + _failedCommitOffsets.add(new MetricName("failed-commit-offsets-total", METRIC_GROUP_NAME, + "The total number of offsets per second that have failed.", tags), new CumulativeSum()); + + metrics.addMetric(new MetricName("offsets-committed-avg", METRIC_GROUP_NAME, "The average offset commits availability.", tags), + (MetricConfig config, long now) -> { + Object offsetCommitTotal = metrics.metrics().get(metrics.metricName("offsets-committed-total", METRIC_GROUP_NAME, tags)).metricValue(); + Object offsetCommitFailTotal = metrics.metrics().get(metrics.metricName("failed-commit-offsets-total", METRIC_GROUP_NAME, tags)).metricValue(); + if (offsetCommitTotal != null && offsetCommitFailTotal != null) { + double offsetsCommittedCount = (double) offsetCommitTotal; + double offsetsCommittedErrorCount = (double) offsetCommitFailTotal; + return offsetsCommittedCount / (offsetsCommittedCount + offsetsCommittedErrorCount); + } else { + return 0; + } + }); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/CommitLatencyMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/CommitLatencyMetrics.java new file mode 100644 index 00000000..495ea545 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/CommitLatencyMetrics.java @@ -0,0 +1,105 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import java.util.Map; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.Sensor; +import org.apache.kafka.common.metrics.stats.Avg; +import org.apache.kafka.common.metrics.stats.Max; +import org.apache.kafka.common.metrics.stats.Percentile; +import org.apache.kafka.common.metrics.stats.Percentiles; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * The CommitLatencyMetrics class contains methods that measures and + * determines the latency of Kafka consumer offset commit(). 
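The derived offsets-committed-avg gauge above reduces to committed / (committed + failed); a worked example with assumed totals:

    double offsetsCommittedCount = 950.0;       // assumed value of offsets-committed-total
    double offsetsCommittedErrorCount = 50.0;   // assumed value of failed-commit-offsets-total
    double availability = offsetsCommittedCount / (offsetsCommittedCount + offsetsCommittedErrorCount);
    // -> 0.95: 95% of offset commits succeeded over the metrics window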
+ */ +public class CommitLatencyMetrics { + private static final String METRIC_GROUP_NAME = "commit-latency-service"; + private static final Logger LOG = LoggerFactory.getLogger(CommitLatencyMetrics.class); + private final Sensor _commitOffsetLatency; + private long _commitStartTimeMs; + private volatile boolean _inProgressCommit; + + /** + * Metrics for Calculating the offset commit latency of a consumer. + * @param metrics the commit offset metrics + * @param tags the tags associated, i.e) kmf.services:name=single-cluster-monitor + */ + public CommitLatencyMetrics(Metrics metrics, Map tags, int latencyPercentileMaxMs, + int latencyPercentileGranularityMs) { + _inProgressCommit = false; + _commitOffsetLatency = metrics.sensor("commit-offset-latency"); + _commitOffsetLatency.add(new MetricName("commit-offset-latency-ms-avg", METRIC_GROUP_NAME, "The average latency in ms of committing offset", tags), new Avg()); + _commitOffsetLatency.add(new MetricName("commit-offset-latency-ms-max", METRIC_GROUP_NAME, "The maximum latency in ms of committing offset", tags), new Max()); + + if (latencyPercentileGranularityMs == 0) { + throw new IllegalArgumentException("The latency percentile granularity was incorrectly passed a zero value."); + } + + // 2 extra buckets exist which are respectively designated for values which are less than 0.0 or larger than max. + int bucketNum = latencyPercentileMaxMs / latencyPercentileGranularityMs + 2; + int sizeInBytes = bucketNum * 4; + _commitOffsetLatency.add(new Percentiles(sizeInBytes, latencyPercentileMaxMs, Percentiles.BucketSizing.CONSTANT, + new Percentile(new MetricName("commit-offset-latency-ms-99th", METRIC_GROUP_NAME, "The 99th percentile latency of committing offset", tags), 99.0), + new Percentile(new MetricName("commit-offset-latency-ms-999th", METRIC_GROUP_NAME, "The 99.9th percentile latency of committing offset", tags), 99.9), + new Percentile(new MetricName("commit-offset-latency-ms-9999th", METRIC_GROUP_NAME, "The 99.99th percentile latency of committing offset", tags), 99.99))); + LOG.info("{} was constructed successfully.", this.getClass().getSimpleName()); + } + + /** + * start the recording of consumer offset commit + */ + public void recordCommitStart() { + if (!_inProgressCommit) { + this.setCommitStartTimeMs(System.currentTimeMillis()); + _inProgressCommit = true; + } else { + // inProgressCommit is already set to TRUE; + LOG.debug("Offset commit is already in progress."); + } + } + + /** + * finish the recording of consumer offset commit + */ + public void recordCommitComplete() { + if (_inProgressCommit) { + long commitCompletedMs = System.currentTimeMillis(); + long commitStartMs = this.commitStartTimeMs(); + this._commitOffsetLatency.record(commitCompletedMs - commitStartMs); + _inProgressCommit = false; + } else { + // inProgressCommit is already set to FALSE; + LOG.debug("Offset commit is not in progress. 
CommitLatencyMetrics shouldn't completing a record commit here."); + } + } + + /** + * set in milliseconds the start time of consumer offset commit + * @param time commit start time in ms + */ + public void setCommitStartTimeMs(long time) { + _commitStartTimeMs = time; + } + + /** + * retrieve the start time of consumer offset commit + * @return _commitStartTimeMs + */ + public long commitStartTimeMs() { + return _commitStartTimeMs; + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ConsumeMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ConsumeMetrics.java new file mode 100644 index 00000000..82d902e9 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ConsumeMetrics.java @@ -0,0 +1,92 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import java.util.Map; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.Sensor; +import org.apache.kafka.common.metrics.stats.Avg; +import org.apache.kafka.common.metrics.stats.CumulativeSum; +import org.apache.kafka.common.metrics.stats.Max; +import org.apache.kafka.common.metrics.stats.Percentile; +import org.apache.kafka.common.metrics.stats.Percentiles; +import org.apache.kafka.common.metrics.stats.Rate; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class ConsumeMetrics { + public final Sensor _consumeError; + public final Sensor _bytesConsumed; + public final Sensor _recordsConsumed; + public final Sensor _recordsDuplicated; + public final Sensor _recordsLost; + public final Sensor _recordsDelay; + public final Sensor _recordsDelayed; + private static final String METRIC_GROUP_NAME = "consume-service"; + private static final Logger LOG = LoggerFactory.getLogger(ConsumeMetrics.class); + + public ConsumeMetrics(final Metrics metrics, Map tags, int latencyPercentileMaxMs, + int latencyPercentileGranularityMs) { + + _bytesConsumed = metrics.sensor("bytes-consumed"); + _bytesConsumed.add(new MetricName("bytes-consumed-rate", METRIC_GROUP_NAME, "The average number of bytes per second that are consumed", tags), new Rate()); + + _consumeError = metrics.sensor("consume-error"); + _consumeError.add(new MetricName("consume-error-rate", METRIC_GROUP_NAME, "The average number of errors per second", tags), new Rate()); + _consumeError.add(new MetricName("consume-error-total", METRIC_GROUP_NAME, "The total number of errors", tags), new CumulativeSum()); + + _recordsConsumed = metrics.sensor("records-consumed"); + _recordsConsumed.add(new MetricName("records-consumed-rate", METRIC_GROUP_NAME, "The average number of records per second that are consumed", tags), new Rate()); + _recordsConsumed.add(new MetricName("records-consumed-total", METRIC_GROUP_NAME, "The total number of records that are consumed", tags), new CumulativeSum()); + + _recordsDuplicated = metrics.sensor("records-duplicated"); + _recordsDuplicated.add(new MetricName("records-duplicated-rate", METRIC_GROUP_NAME, "The average 
number of records per second that are duplicated", tags), new Rate()); + _recordsDuplicated.add(new MetricName("records-duplicated-total", METRIC_GROUP_NAME, "The total number of records that are duplicated", tags), new CumulativeSum()); + + _recordsLost = metrics.sensor("records-lost"); + _recordsLost.add(new MetricName("records-lost-rate", METRIC_GROUP_NAME, "The average number of records per second that are lost", tags), new Rate()); + _recordsLost.add(new MetricName("records-lost-total", METRIC_GROUP_NAME, "The total number of records that are lost", tags), new CumulativeSum()); + + _recordsDelayed = metrics.sensor("records-delayed"); + _recordsDelayed.add(new MetricName("records-delayed-rate", METRIC_GROUP_NAME, "The average number of records per second that are either lost or arrive after maximum allowed latency under SLA", tags), new Rate()); + _recordsDelayed.add(new MetricName("records-delayed-total", METRIC_GROUP_NAME, "The total number of records that are either lost or arrive after maximum allowed latency under SLA", tags), new CumulativeSum()); + + _recordsDelay = metrics.sensor("records-delay"); + _recordsDelay.add(new MetricName("records-delay-ms-avg", METRIC_GROUP_NAME, "The average latency of records from producer to consumer", tags), new Avg()); + _recordsDelay.add(new MetricName("records-delay-ms-max", METRIC_GROUP_NAME, "The maximum latency of records from producer to consumer", tags), new Max()); + + // There are 2 extra buckets use for values smaller than 0.0 or larger than max, respectively. + int bucketNum = latencyPercentileMaxMs / latencyPercentileGranularityMs + 2; + int sizeInBytes = 4 * bucketNum; + _recordsDelay.add(new Percentiles(sizeInBytes, latencyPercentileMaxMs, Percentiles.BucketSizing.CONSTANT, + new Percentile(new MetricName("records-delay-ms-99th", METRIC_GROUP_NAME, "The 99th percentile latency of records from producer to consumer", tags), 99.0), + new Percentile(new MetricName("records-delay-ms-999th", METRIC_GROUP_NAME, "The 99.9th percentile latency of records from producer to consumer", tags), 99.9), + new Percentile(new MetricName("records-delay-ms-9999th", METRIC_GROUP_NAME, "The 99.99th percentile latency of records from producer to consumer", tags), 99.99))); + + metrics.addMetric(new MetricName("consume-availability-avg", METRIC_GROUP_NAME, "The average consume availability", tags), + (config, now) -> { + double recordsConsumedRate = (double) metrics.metrics().get(metrics.metricName("records-consumed-rate", METRIC_GROUP_NAME, tags)).metricValue(); + double recordsLostRate = (double) metrics.metrics().get(metrics.metricName("records-lost-rate", METRIC_GROUP_NAME, tags)).metricValue(); + double recordsDelayedRate = (double) metrics.metrics().get(metrics.metricName("records-delayed-rate", METRIC_GROUP_NAME, tags)).metricValue(); + + if (new Double(recordsLostRate).isNaN()) + recordsLostRate = 0; + if (new Double(recordsDelayedRate).isNaN()) + recordsDelayedRate = 0; + + return recordsConsumedRate + recordsLostRate > 0 + ? (recordsConsumedRate - recordsDelayedRate) / (recordsConsumedRate + recordsLostRate) : 0; + } + ); + } +} diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/OffsetCommitServiceMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/OffsetCommitServiceMetrics.java new file mode 100644 index 00000000..b6a6e753 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/OffsetCommitServiceMetrics.java @@ -0,0 +1,113 @@ +/** + * Copyright 2020 LinkedIn Corp. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import java.util.Map; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.metrics.Measurable; +import org.apache.kafka.common.metrics.MetricConfig; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.Sensor; +import org.apache.kafka.common.metrics.stats.Avg; +import org.apache.kafka.common.metrics.stats.CumulativeSum; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class OffsetCommitServiceMetrics extends XinfraMonitorMetrics { + + private final Sensor _offsetCommittedSensor; + private final Sensor _offsetCommitFailSensor; + private static final Logger LOGGER = LoggerFactory.getLogger(OffsetCommitServiceMetrics.class); + private static final String METRIC_GROUP_NAME = "offset-commit-service"; + private static final String SUCCESS_SENSOR_NAME = "offset-commit-service-success"; + private static final String SUCCESS_RATE_METRIC = "offset-commit-service-success-rate"; + private static final String SUCCESS_METRIC_TOTAL = "offset-commit-service-success-total"; + private static final String FAILURE_SENSOR_NAME = "offset-commit-service-failure"; + private static final String FAILURE_RATE_METRIC = "offset-commit-service-failure-rate"; + private static final String FAILURE_METRIC_TOTAL = "offset-commit-service-failure-total"; + + /** + * + * @param metrics a named, numerical measurement. + * Sensor is a handle to record numerical measurements as they occur. + * @param tags metrics/sensor's tags + */ + public OffsetCommitServiceMetrics(final Metrics metrics, final Map tags) { + super(metrics, tags); + _offsetCommittedSensor = metrics.sensor(SUCCESS_SENSOR_NAME); + _offsetCommittedSensor.add(new MetricName(SUCCESS_RATE_METRIC, METRIC_GROUP_NAME, + "The success rate of group coordinator accepting consumer offset commit requests.", tags), new Avg()); + _offsetCommittedSensor.add(new MetricName(SUCCESS_METRIC_TOTAL, METRIC_GROUP_NAME, + "The total count of group coordinator successfully accepting consumer offset commit requests.", tags), + new CumulativeSum()); + + _offsetCommitFailSensor = metrics.sensor(FAILURE_SENSOR_NAME); + /* NaN will persist as long as no record is submitted to the failure sensor. + we'll continue with NaN for now since we'd rather that the Sensor itself is a true and unaltered record of what values it recorded. 
*/ + _offsetCommitFailSensor.add(new MetricName(FAILURE_RATE_METRIC, METRIC_GROUP_NAME, + "The failure rate of group coordinator accepting consumer offset commit requests.", tags), new Avg()); + _offsetCommitFailSensor.add(new MetricName(FAILURE_METRIC_TOTAL, METRIC_GROUP_NAME, + "The total count of group coordinator unsuccessfully receiving consumer offset commit requests.", tags), + new CumulativeSum()); + + Measurable measurable = new Measurable() { + @Override + public double measure(MetricConfig config, long now) { + double offsetCommitSuccessRate = (double) metrics.metrics() + .get(metrics.metricName(SUCCESS_RATE_METRIC, METRIC_GROUP_NAME, tags)) + .metricValue(); + double offsetCommitFailureRate = (double) metrics.metrics() + .get(metrics.metricName(FAILURE_RATE_METRIC, METRIC_GROUP_NAME, tags)) + .metricValue(); + + if (new Double(offsetCommitSuccessRate).isNaN()) { + offsetCommitSuccessRate = 0; + } + + if (new Double(offsetCommitFailureRate).isNaN()) { + offsetCommitFailureRate = 0; + } + + return offsetCommitSuccessRate + offsetCommitFailureRate > 0 ? offsetCommitSuccessRate / ( + offsetCommitSuccessRate + offsetCommitFailureRate) : 0; + } + }; + + metrics.addMetric(new MetricName("offset-commit-availability-avg", METRIC_GROUP_NAME, + "The average offset commit availability with respect to the group coordinator.", tags), measurable); + } + + /** + * start measuring and its RPC (remote programmable client) + */ + public void recordSuccessful() { + _offsetCommittedSensor.record(); + LOGGER.debug("recorded successful."); + } + + public void recordFailed() { + _offsetCommitFailSensor.record(); + LOGGER.error("The offset commit failed due to the response future failing and the future NOT being retriable."); + } + + public void recordUnavailable() { + _offsetCommitFailSensor.record(); + LOGGER.error("The offset commit failed due to coordinator being unavailable."); + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } +} + + diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ProduceMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ProduceMetrics.java new file mode 100644 index 00000000..1ce7202e --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/ProduceMetrics.java @@ -0,0 +1,139 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
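These record methods are meant to be driven from the consumer's offset commit callback; a hedged sketch of that wiring (the callback shape is the standard KafkaConsumer commitAsync API, but how the service itself hooks in is not shown in this patch):

    consumer.commitAsync((offsets, exception) -> {
      if (exception == null) {
        offsetCommitServiceMetrics.recordSuccessful();
      } else {
        offsetCommitServiceMetrics.recordFailed();
      }
    });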
+ */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import com.linkedin.xinfra.monitor.XinfraMonitorConstants; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.kafka.common.MetricName; +import org.apache.kafka.common.metrics.Metrics; +import org.apache.kafka.common.metrics.Sensor; +import org.apache.kafka.common.metrics.stats.Avg; +import org.apache.kafka.common.metrics.stats.CumulativeSum; +import org.apache.kafka.common.metrics.stats.Max; +import org.apache.kafka.common.metrics.stats.Percentile; +import org.apache.kafka.common.metrics.stats.Percentiles; +import org.apache.kafka.common.metrics.stats.Rate; + + +public class ProduceMetrics { + + public final Metrics _metrics; + public final Sensor _recordsProduced; + public final Sensor _produceError; + public final Sensor _produceDelay; + public final ConcurrentMap<Integer, Sensor> _recordsProducedPerPartition; + public final ConcurrentMap<Integer, Sensor> _produceErrorPerPartition; + public final ConcurrentMap<Integer, Boolean> _produceErrorInLastSendPerPartition; + private final Map<String, String> _tags; + + public ProduceMetrics(final Metrics metrics, final Map<String, String> tags, int latencyPercentileGranularityMs, + int latencyPercentileMaxMs, AtomicInteger partitionNumber, boolean treatZeroThroughputAsUnavailable) { + _metrics = metrics; + _tags = tags; + + _recordsProducedPerPartition = new ConcurrentHashMap<>(); + _produceErrorPerPartition = new ConcurrentHashMap<>(); + _produceErrorInLastSendPerPartition = new ConcurrentHashMap<>(); + + _recordsProduced = metrics.sensor("records-produced"); + _recordsProduced.add( + new MetricName("records-produced-rate", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The average number of records per second that are produced", tags), new Rate()); + _recordsProduced.add( + new MetricName("records-produced-total", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The total number of records that are produced", tags), new CumulativeSum()); + + _produceError = metrics.sensor("produce-error"); + _produceError.add(new MetricName("produce-error-rate", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The average number of errors per second", tags), new Rate()); + _produceError.add(new MetricName("produce-error-total", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The total number of errors", tags), new CumulativeSum()); + + _produceDelay = metrics.sensor("produce-delay"); + _produceDelay.add(new MetricName("produce-delay-ms-avg", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The average delay in ms for produce requests", tags), new Avg()); + _produceDelay.add(new MetricName("produce-delay-ms-max", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The maximum delay in ms for produce requests", tags), new Max()); + + // There are 2 extra buckets used for values smaller than 0.0 or larger than the max, respectively.
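+ // For example, with the values assumed in ConsumeServiceTest later in this patch (latencyPercentileMaxMs = 5000,
+ // latencyPercentileGranularityMs = 1), bucketNum = 5000 / 1 + 2 = 5002 and the histogram is allocated
+ // sizeInBytes = 4 * 5002 = 20008 bytes, i.e. 4 bytes per bucket.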
+ int bucketNum = latencyPercentileMaxMs / latencyPercentileGranularityMs + 2; + int sizeInBytes = 4 * bucketNum; + _produceDelay.add(new Percentiles(sizeInBytes, latencyPercentileMaxMs, Percentiles.BucketSizing.CONSTANT, + new Percentile(new MetricName("produce-delay-ms-99th", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The 99th percentile delay in ms for produce requests", tags), 99.0), new Percentile( + new MetricName("produce-delay-ms-999th", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The 99.9th percentile delay in ms for produce requests", tags), 99.9), new Percentile( + new MetricName("produce-delay-ms-9999th", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The 99.99th percentile delay in ms for produce requests", tags), 99.99))); + + metrics.addMetric( + new MetricName("produce-availability-avg", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The average produce availability", tags), (config, now) -> { + double availabilitySum = 0.0; + int partitionNum = partitionNumber.get(); + for (int partition = 0; partition < partitionNum; partition++) { + double recordsProduced = (double) metrics.metrics() + .get(metrics.metricName("records-produced-rate-partition-" + partition, + XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, tags)) + .metricValue(); + double produceError = (double) metrics.metrics() + .get(metrics.metricName("produce-error-rate-partition-" + partition, + XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, tags)) + .metricValue(); + // If there is no error, the error rate sensor may expire and the value may be NaN. Treat NaN as 0 for the error rate. + if (Double.isNaN(produceError) || Double.isInfinite(produceError)) { + produceError = 0; + } + // If there was any produce (successful or failed) to a partition, use its success fraction as its availability. + if (recordsProduced + produceError > 0) { + availabilitySum += recordsProduced / (recordsProduced + produceError); + } else if (!treatZeroThroughputAsUnavailable) { + // If the user configures treatZeroThroughputAsUnavailable to be false, a partition's availability + // is 1.0 as long as no exception is thrown from the producer. + // This allows Kafka admins to monitor exactly the availability experienced by Kafka users, whose producers + // will block and retry for a certain amount of time based on their configuration (e.g. retries, retry.backoff.ms). + // Note that if it takes a long time for messages to be retried and sent, the latency in the ConsumeService + // will increase, and that will reduce ConsumeAvailability if the latency exceeds consume.latency.sla.ms. + // If the timeout is set to more than 60 seconds (the current samples window duration), + // the error sample might expire before the next error can be produced. + // In order to detect an offline partition with a high producer timeout config, the error status during the last + // send is also checked before declaring 1.0 availability for the partition.
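+ // Worked example: if records-produced-rate-partition-0 is 9.5 records/s and produce-error-rate-partition-0 is
+ // 0.5 errors/s, partition 0 contributes 9.5 / (9.5 + 0.5) = 0.95 to availabilitySum; with partitionNum = 4
+ // partitions, the gauge reports availabilitySum / 4.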
+ Boolean lastSendError = _produceErrorInLastSendPerPartition.get(partition); + if (lastSendError == null || !lastSendError) { + availabilitySum += 1.0; + } + } + } + + // Assign equal weight to per-partition availability when calculating overall availability + return availabilitySum / partitionNum; + } + ); + } + + public void addPartitionSensors(int partition) { + Sensor recordsProducedSensor = _metrics.sensor("records-produced-partition-" + partition); + recordsProducedSensor.add(new MetricName("records-produced-rate-partition-" + partition, + XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The average number of records per second that are produced to this partition", _tags), new Rate()); + _recordsProducedPerPartition.put(partition, recordsProducedSensor); + + Sensor errorsSensor = _metrics.sensor("produce-error-partition-" + partition); + errorsSensor.add(new MetricName("produce-error-rate-partition-" + partition, + XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, + "The average number of errors per second when producing to this partition", _tags), new Rate()); + _produceErrorPerPartition.put(partition, errorsSensor); + } +} + diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/metrics/XinfraMonitorMetrics.java b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/XinfraMonitorMetrics.java new file mode 100644 index 00000000..a6cc8cee --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/services/metrics/XinfraMonitorMetrics.java @@ -0,0 +1,35 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services.metrics; + +import java.util.Map; +import org.apache.kafka.common.metrics.Metrics; + + +/** + * Parent class that the per-service metrics classes extend. + */ +class XinfraMonitorMetrics { + + final Metrics _metrics; + final Map<String, String> _tags; + + /** + * @param metrics the Metrics registry that sensors and metrics are registered with; a Sensor is a handle + * for recording numerical measurements as they occur. + * @param tags the tags attached to every sensor and metric + */ + XinfraMonitorMetrics(Metrics metrics, Map<String, String> tags) { + _metrics = metrics; + _tags = tags; + } + +} diff --git a/src/main/java/com/linkedin/kmf/tests/BasicEndToEndTest.java b/src/main/java/com/linkedin/xinfra/monitor/tests/BasicEndToEndTest.java similarity index 63% rename from src/main/java/com/linkedin/kmf/tests/BasicEndToEndTest.java rename to src/main/java/com/linkedin/xinfra/monitor/tests/BasicEndToEndTest.java index bd1ff8d1..4a8729e6 100644 --- a/src/main/java/com/linkedin/kmf/tests/BasicEndToEndTest.java +++ b/src/main/java/com/linkedin/xinfra/monitor/tests/BasicEndToEndTest.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,15 +7,19 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
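To illustrate the XinfraMonitorMetrics parent class above: a subclass only needs to call super(metrics, tags) and register its own sensors against the shared _metrics and _tags fields, as OffsetCommitServiceMetrics does. A hypothetical minimal sketch (ExampleServiceMetrics, its sensor, and its metric names are illustrative only and not part of this patch; it assumes the same imports as OffsetCommitServiceMetrics):

class ExampleServiceMetrics extends XinfraMonitorMetrics {
  private final Sensor _exampleSensor;

  ExampleServiceMetrics(Metrics metrics, Map<String, String> tags) {
    super(metrics, tags);
    // Register one sensor with a cumulative-count metric, mirroring the pattern used above.
    _exampleSensor = _metrics.sensor("example-events");
    _exampleSensor.add(new MetricName("example-events-total", "example-service",
        "The total count of example events observed.", _tags), new CumulativeSum());
  }

  void recordEvent() {
    _exampleSensor.record();
  }
}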
*/ -package com.linkedin.kmf.tests; -import com.linkedin.kmf.apps.SingleClusterMonitor; -import com.linkedin.kmf.services.TopicManagementService; -import com.linkedin.kmf.services.ConsumeService; -import com.linkedin.kmf.services.ProduceService; +package com.linkedin.xinfra.monitor.tests; + +import com.linkedin.xinfra.monitor.apps.SingleClusterMonitor; +import com.linkedin.xinfra.monitor.services.ConsumeService; +import com.linkedin.xinfra.monitor.services.ConsumerFactoryImpl; +import com.linkedin.xinfra.monitor.services.ProduceService; +import com.linkedin.xinfra.monitor.services.TopicManagementService; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Map; /* @@ -43,16 +47,24 @@ public class BasicEndToEndTest implements Test { public BasicEndToEndTest(Map props, String name) throws Exception { _name = name; _topicManagementService = new TopicManagementService(props, name); + CompletableFuture topicPartitionReady = _topicManagementService.topicPartitionResult(); _produceService = new ProduceService(props, name); - _consumeService = new ConsumeService(props, name); + ConsumerFactoryImpl consumerFactory = new ConsumerFactoryImpl(props); + _consumeService = new ConsumeService(name, topicPartitionReady, consumerFactory); } @Override public void start() { _topicManagementService.start(); - _produceService.start(); - _consumeService.start(); - LOG.info(_name + "/BasicEndToEndTest started"); + CompletableFuture topicPartitionResult = _topicManagementService.topicPartitionResult(); + topicPartitionResult.thenRun(() -> { + try { + _produceService.start(); + _consumeService.start(); + } finally { + LOG.info("{} /BasicEndToEndTest started.", _name); + } + }); } @Override @@ -60,7 +72,7 @@ public void stop() { _topicManagementService.stop(); _produceService.stop(); _consumeService.stop(); - LOG.info(_name + "/BasicEndToEndTest stopped"); + LOG.info("{} /BasicEndToEndTest stopped.", _name); } @Override @@ -70,9 +82,9 @@ public boolean isRunning() { @Override public void awaitShutdown() { - _topicManagementService.awaitShutdown(); - _produceService.awaitShutdown(); - _consumeService.awaitShutdown(); + _topicManagementService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + _produceService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + _consumeService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); } public static void main(String[] args) throws Exception { diff --git a/src/main/java/com/linkedin/kmf/tests/Test.java b/src/main/java/com/linkedin/xinfra/monitor/tests/Test.java similarity index 79% rename from src/main/java/com/linkedin/kmf/tests/Test.java rename to src/main/java/com/linkedin/xinfra/monitor/tests/Test.java index 09e6a10a..6da22e66 100644 --- a/src/main/java/com/linkedin/kmf/tests/Test.java +++ b/src/main/java/com/linkedin/xinfra/monitor/tests/Test.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,9 +7,9 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -package com.linkedin.kmf.tests; +package com.linkedin.xinfra.monitor.tests; -import com.linkedin.kmf.apps.App; +import com.linkedin.xinfra.monitor.apps.App; /** * @deprecated This interface has been deprecated and will be removed in a future release. Please use com.linkedin.kmf.apps.App instead. diff --git a/src/main/java/com/linkedin/xinfra/monitor/topicfactory/DefaultTopicFactory.java b/src/main/java/com/linkedin/xinfra/monitor/topicfactory/DefaultTopicFactory.java new file mode 100644 index 00000000..8b2ddef7 --- /dev/null +++ b/src/main/java/com/linkedin/xinfra/monitor/topicfactory/DefaultTopicFactory.java @@ -0,0 +1,38 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.topicfactory; + +import com.linkedin.xinfra.monitor.common.Utils; +import java.util.Collections; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import org.apache.kafka.clients.admin.AdminClient; + + +public class DefaultTopicFactory implements TopicFactory { + + /** This constructor is required by TopicFactory but does nothing. */ + public DefaultTopicFactory(Map config) { + } + + @Override + public int createTopicIfNotExist(String topic, short replicationFactor, double partitionToBrokerRatio, Properties topicConfig, AdminClient adminClient) + throws ExecutionException, InterruptedException { + return Utils.createTopicIfNotExists(topic, replicationFactor, partitionToBrokerRatio, 1, topicConfig, adminClient); + } + + @Override + public Set getExcludedBrokers(AdminClient adminClient) { + return Collections.emptySet(); + } +} diff --git a/src/main/java/com/linkedin/kmf/topicfactory/TopicFactory.java b/src/main/java/com/linkedin/xinfra/monitor/topicfactory/TopicFactory.java similarity index 61% rename from src/main/java/com/linkedin/kmf/topicfactory/TopicFactory.java rename to src/main/java/com/linkedin/xinfra/monitor/topicfactory/TopicFactory.java index 3f4c59f3..b3d0a706 100644 --- a/src/main/java/com/linkedin/kmf/topicfactory/TopicFactory.java +++ b/src/main/java/com/linkedin/xinfra/monitor/topicfactory/TopicFactory.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -7,15 +7,19 @@ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ -package com.linkedin.kmf.topicfactory; + +package com.linkedin.xinfra.monitor.topicfactory; import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import org.apache.kafka.clients.admin.AdminClient; /** - * Constructs the monitor topic if it does not exist. + * Constructs the monitor topic if it does not exist, and provides excluded-broker info for the topic management service. * - * Implementations of this class should have a public constructor with the following signature:
+ * Implementations of this class should have a public constructor with the following signature: * Constructor(Map<String, ?> config) where config are additional configuration parameters passed in from the Kafka * Monitor configuration. */ @@ -23,7 +27,6 @@ public interface TopicFactory { /** * Creates the specified topic if it does not exist. - * @param zkUrl zookeeper connection url * @param topic topic name * @param replicationFactor the replication factor for the topic * @param partitionToBrokerRatio This is multiplied by the number brokers to compute the number of partitions in the topic. @@ -32,6 +35,13 @@ public interface TopicFactory { * @return The number of partitions for the specified topic. */ - int createTopicIfNotExist(String zkUrl, String topic, int replicationFactor, double partitionToBrokerRatio, Properties topicProperties); + int createTopicIfNotExist(String topic, short replicationFactor, double partitionToBrokerRatio, Properties topicProperties, AdminClient adminClient) + throws ExecutionException, InterruptedException; + + /** + * @param adminClient AdminClient object + * @return A set of brokers that don't take new partitions or reassigned partitions for topics. + */ + Set getExcludedBrokers(AdminClient adminClient); } diff --git a/src/test/java/com/linkedin/kmf/KafkaMonitorTest.java b/src/test/java/com/linkedin/kmf/KafkaMonitorTest.java deleted file mode 100644 index f1232d51..00000000 --- a/src/test/java/com/linkedin/kmf/KafkaMonitorTest.java +++ /dev/null @@ -1,137 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -package com.linkedin.kmf; - -import com.linkedin.kmf.services.Service; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; - -import java.util.concurrent.atomic.AtomicReference; -import org.testng.annotations.Test; - - -@Test -public class KafkaMonitorTest { - - @Test - public void lifecycleTest() throws Exception { - KafkaMonitor kafkaMonitor = kafkaMonitor(); - - // Nothing should be started - assertEquals(FakeService.startCount.get(), 0); - assertEquals(FakeService.stopCount.get(), 0); - - // Should accept but ignore start because start has not been called - kafkaMonitor.stop(); - assertEquals(FakeService.stopCount.get(), 0); - - // Should start - kafkaMonitor.start(); - assertEquals(FakeService.startCount.get(), 1); - - // Should allow start to be called more than once - kafkaMonitor.stop(); - kafkaMonitor.stop(); - assertEquals(FakeService.startCount.get(), 1); - assertEquals(FakeService.stopCount.get(), 1); - - // Should be allowed to shutdown more than once. 
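To illustrate the TopicFactory contract described above: an implementation might exclude a fixed, configured set of brokers from receiving partitions of the monitor topic. A hypothetical sketch (StaticExclusionTopicFactory and the excluded.broker.ids key are illustrative only; it assumes the same imports as DefaultTopicFactory plus java.util.HashSet):

public class StaticExclusionTopicFactory implements TopicFactory {
  private final Set<Integer> _excludedBrokerIds = new HashSet<>();

  public StaticExclusionTopicFactory(Map<String, ?> config) {
    // "excluded.broker.ids" is a hypothetical key holding a comma-separated list of broker ids.
    Object ids = config.get("excluded.broker.ids");
    if (ids != null) {
      for (String id : ids.toString().split(",")) {
        _excludedBrokerIds.add(Integer.parseInt(id.trim()));
      }
    }
  }

  @Override
  public int createTopicIfNotExist(String topic, short replicationFactor, double partitionToBrokerRatio,
      Properties topicConfig, AdminClient adminClient) throws ExecutionException, InterruptedException {
    // Delegate creation to the same helper that DefaultTopicFactory uses.
    return Utils.createTopicIfNotExists(topic, replicationFactor, partitionToBrokerRatio, 1, topicConfig, adminClient);
  }

  @Override
  public Set<Integer> getExcludedBrokers(AdminClient adminClient) {
    return Collections.unmodifiableSet(_excludedBrokerIds);
  }
}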
- kafkaMonitor.awaitShutdown(); - kafkaMonitor.awaitShutdown(); - } - - @Test - public void awaitShutdownOtherThread() throws Exception { - final KafkaMonitor kafkaMonitor = kafkaMonitor(); - final AtomicReference error = new AtomicReference<>(); - - Thread t = new Thread("test awaitshutdown thread") { - @Override - public void run() { - try { - kafkaMonitor.awaitShutdown(); - } catch (Throwable t) { - error.set(t); - } - } - }; - - t.start(); - kafkaMonitor.start(); - Thread.sleep(100); - kafkaMonitor.stop(); - t.join(500); - assertFalse(t.isAlive()); - assertEquals(error.get(), null); - } - - private KafkaMonitor kafkaMonitor() throws Exception { - FakeService.clearCounters(); - Map config = new HashMap<>(); - Map fakeServiceConfig = new HashMap<>(); - config.put("fake-service", fakeServiceConfig); - fakeServiceConfig.put(KafkaMonitor.CLASS_NAME_CONFIG, FakeService.class.getName()); - return new KafkaMonitor(config); - } - - - static final class FakeService implements Service { - - private static AtomicInteger startCount = new AtomicInteger(); - private static AtomicInteger stopCount = new AtomicInteger(); - private final AtomicBoolean _isRunning = new AtomicBoolean(); - - /** required */ - public FakeService(Map config, String serviceInstanceName) { - - } - - private static void clearCounters() { - startCount.set(0); - stopCount.set(0); - } - - @Override - public void start() { - _isRunning.compareAndSet(false, true); - startCount.incrementAndGet(); - } - - @Override - public synchronized void stop() { - _isRunning.compareAndSet(true, false); - stopCount.incrementAndGet(); - this.notifyAll(); - } - - @Override - public boolean isRunning() { - return _isRunning.get(); - } - - @Override - public synchronized void awaitShutdown() { - try { - if (stopCount.get() == 0) { - wait(3_000); - if (stopCount.get() == 0) { - throw new IllegalStateException("Never notified."); - } - } - } catch (InterruptedException e) { - throw new IllegalStateException(e); - } - } - } -} diff --git a/src/test/java/com/linkedin/kmf/services/TopicManagementServiceTest.java b/src/test/java/com/linkedin/kmf/services/TopicManagementServiceTest.java deleted file mode 100644 index 2e997c24..00000000 --- a/src/test/java/com/linkedin/kmf/services/TopicManagementServiceTest.java +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -package com.linkedin.kmf.services; - -import java.util.ArrayList; -import java.util.List; -import kafka.cluster.Broker; -import org.apache.kafka.common.Node; -import org.apache.kafka.common.PartitionInfo; -import org.testng.Assert; -import org.testng.annotations.Test; -import com.linkedin.kmf.services.MultiClusterTopicManagementService.TopicManagementHelper; - -@Test -public class TopicManagementServiceTest { - - private static final String TOPIC = "kmf-unit-test-topic"; - - private List brokers(int brokerCount) { - List brokers = new ArrayList<>(); - for (int i = 0; i < brokerCount; i++) { - brokers.add(new Broker(i, "", -1, null)); - } - return brokers; - } - - private Node[] nodes(int brokerCount) { - Node[] nodes = new Node[brokerCount]; - for (int i = 0; i < brokerCount; i++) { - nodes[i] = new Node(i, "", -1); - } - return nodes; - } - - @Test - public void noDetection() { - List partitions = new ArrayList<>(); - Node[] node = nodes(2); - partitions.add(new PartitionInfo(TOPIC, 0, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 1, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 2, node[1], new Node[] {node[1], node[0]}, null)); - partitions.add(new PartitionInfo(TOPIC, 3, node[1], new Node[] {node[1], node[0]}, null)); - - Assert.assertFalse(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(2))); - Assert.assertFalse(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(2))); - } - - @Test - public void detectLowTotalNumberOfPartitions() { - List partitions = new ArrayList<>(); - Node[] node = nodes(3); - partitions.add(new PartitionInfo(TOPIC, 0, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 1, node[1], new Node[] {node[1], node[0]}, null)); - partitions.add(new PartitionInfo(TOPIC, 2, node[2], new Node[] {node[2], node[0]}, null)); - - Assert.assertFalse(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(3))); - Assert.assertFalse(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(3))); - Assert.assertEquals(TopicManagementHelper.getReplicationFactor(partitions), 2); - } - - - @Test - public void detectBrokerWithoutLeader() { - List partitions = new ArrayList<>(); - Node[] node = nodes(3); - partitions.add(new PartitionInfo(TOPIC, 0, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 1, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 2, node[1], new Node[] {node[1], node[0]}, null)); - partitions.add(new PartitionInfo(TOPIC, 3, node[1], new Node[] {node[2], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 4, node[1], new Node[] {node[2], node[0]}, null)); - - Assert.assertFalse(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(3))); - Assert.assertTrue(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(3))); - } - - @Test - public void detectBrokerWithoutPreferredLeader() { - List partitions = new ArrayList<>(); - Node[] node = nodes(3); - partitions.add(new PartitionInfo(TOPIC, 0, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 1, node[0], new Node[] {node[0], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 2, node[1], new Node[] {node[0], node[0]}, null)); - partitions.add(new PartitionInfo(TOPIC, 3, node[1], new Node[] {node[2], node[1]}, null)); - partitions.add(new PartitionInfo(TOPIC, 4, 
node[1], new Node[] {node[2], node[0]}, null)); - - Assert.assertTrue(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(3))); - Assert.assertTrue(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(3))); - } -} diff --git a/src/test/java/com/linkedin/xinfra/monitor/XinfraMonitorTest.java b/src/test/java/com/linkedin/xinfra/monitor/XinfraMonitorTest.java new file mode 100644 index 00000000..9867718d --- /dev/null +++ b/src/test/java/com/linkedin/xinfra/monitor/XinfraMonitorTest.java @@ -0,0 +1,164 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor; + +import com.linkedin.xinfra.monitor.services.ServiceFactory; +import com.linkedin.xinfra.monitor.services.Service; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import org.testng.annotations.Test; + + +@Test +public class XinfraMonitorTest { + + @Test + public void lifecycleTest() throws Exception { + XinfraMonitor xinfraMonitor = xinfraMonitor(); + + /* Nothing should be started */ + org.testng.Assert.assertEquals(FakeService.START_COUNT.get(), 0); + org.testng.Assert.assertEquals(FakeService.STOP_COUNT.get(), 0); + + /* Should accept but ignore start because start has not been called */ + + xinfraMonitor.stop(); + org.testng.Assert.assertEquals(FakeService.STOP_COUNT.get(), 0); + + /* Should start */ + xinfraMonitor.start(); + org.testng.Assert.assertEquals(FakeService.START_COUNT.get(), 1); + + /* Should allow start to be called more than once */ + xinfraMonitor.stop(); + xinfraMonitor.stop(); + org.testng.Assert.assertEquals(FakeService.START_COUNT.get(), 1); + org.testng.Assert.assertEquals(FakeService.STOP_COUNT.get(), 1); + + + /* Should be allowed to shutdown more than once. */ + xinfraMonitor.awaitShutdown(); + xinfraMonitor.awaitShutdown(); + } + + @Test + public void awaitShutdownOtherThread() throws Exception { + final XinfraMonitor xinfraMonitor = xinfraMonitor(); + final AtomicReference error = new AtomicReference<>(); + + Thread t = new Thread("test awaitshutdown thread") { + @Override + public void run() { + try { + xinfraMonitor.awaitShutdown(); + } catch (Throwable t) { + error.set(t); + } + } + }; + + t.start(); + xinfraMonitor.start(); + Thread.sleep(100); + xinfraMonitor.stop(); + t.join(500); + org.testng.Assert.assertFalse(t.isAlive()); + org.testng.Assert.assertEquals(error.get(), null); + } + + private XinfraMonitor xinfraMonitor() throws Exception { + FakeService.clearCounters(); + Map config = new HashMap<>(); + Map fakeServiceConfig = new HashMap<>(); + + fakeServiceConfig.put(XinfraMonitorConstants.CLASS_NAME_CONFIG, FakeService.class.getName()); + config.put("fake-service", fakeServiceConfig); + return new XinfraMonitor(config); + + } + + /** + * Factory class which instantiates a new FakeService service object. 
+ */ + @SuppressWarnings("rawtypes") + static final class FakeServiceFactory implements ServiceFactory { + + private final Map _config; + private final String _serviceInstanceName; + + public FakeServiceFactory(Map config, String serviceInstanceName) { + + this._config = config; + this._serviceInstanceName = serviceInstanceName; + } + + @SuppressWarnings("unchecked") + @Override + public Service createService() throws Exception { + + return new XinfraMonitorTest.FakeService(_config, _serviceInstanceName); + + } + } + + static final class FakeService implements Service { + + private static final AtomicInteger START_COUNT = new AtomicInteger(); + private static final AtomicInteger STOP_COUNT = new AtomicInteger(); + private final AtomicBoolean _isRunning = new AtomicBoolean(); + + /** required */ + public FakeService(Map config, String serviceInstanceName) { + + } + + private static void clearCounters() { + START_COUNT.set(0); + STOP_COUNT.set(0); + } + + @Override + public void start() { + _isRunning.compareAndSet(false, true); + START_COUNT.incrementAndGet(); + } + + @Override + public synchronized void stop() { + _isRunning.compareAndSet(true, false); + STOP_COUNT.incrementAndGet(); + notifyAll(); + } + + @Override + public boolean isRunning() { + return _isRunning.get(); + } + + @Override + public synchronized void awaitShutdown(long timeout, TimeUnit timeUnit) { + try { + if (STOP_COUNT.get() == 0) { + wait(3_000); + if (STOP_COUNT.get() == 0) { + throw new IllegalStateException("Never notified."); + } + } + } catch (InterruptedException e) { + throw new IllegalStateException(e); + } + } + } +} diff --git a/src/test/java/com/linkedin/xinfra/monitor/consumer/NewConsumerTest.java b/src/test/java/com/linkedin/xinfra/monitor/consumer/NewConsumerTest.java new file mode 100644 index 00000000..da93a175 --- /dev/null +++ b/src/test/java/com/linkedin/xinfra/monitor/consumer/NewConsumerTest.java @@ -0,0 +1,103 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +package com.linkedin.xinfra.monitor.consumer; + +import com.linkedin.xinfra.monitor.common.ConsumerGroupCoordinatorUtils; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.ExecutionException; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.DescribeTopicsResult; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.internals.KafkaFutureImpl; +import org.apache.kafka.common.internals.Topic; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +@Test +public class NewConsumerTest { + private static final int NUM_OFFSETS_TOPIC_PARTITIONS = 5; + private static final String TARGET_CONSUMER_GROUP_ID = "target-group-id"; + + @BeforeMethod + public void beforeMethod() { + System.out.println("Running beforeMethod of " + this.getClass()); + } + + @AfterMethod + public void afterMethod() { + System.out.println("Finished running testConsumerGroupCoordinatorHashing() of " + this.getClass()); + } + + @SuppressWarnings("unchecked") + @Test + public void testConsumerGroupCoordinatorHashing() throws ExecutionException, InterruptedException { + Properties consumerProperties = new Properties(); + + AdminClient adminClient = Mockito.mock(AdminClient.class); + + /* + * Mock the behavior of AdminClient only. + */ + Mockito.when(adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME))) + .thenReturn(Mockito.mock(DescribeTopicsResult.class)); + Mockito.when(adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME)).values()) + .thenReturn(Mockito.mock(Map.class)); + Mockito.when(adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME)) + .values() + .get(Topic.GROUP_METADATA_TOPIC_NAME)).thenReturn(Mockito.mock(KafkaFutureImpl.class)); + + Mockito.when(adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME)) + .values() + .get(Topic.GROUP_METADATA_TOPIC_NAME) + .get()).thenReturn(Mockito.mock(TopicDescription.class)); + + Mockito.when(adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME)) + .values() + .get(Topic.GROUP_METADATA_TOPIC_NAME) + .get() + .partitions()).thenReturn(Mockito.mock(List.class)); + + Mockito.when(adminClient.describeTopics(Collections.singleton(Topic.GROUP_METADATA_TOPIC_NAME)) + .values() + .get(Topic.GROUP_METADATA_TOPIC_NAME) + .get() + .partitions() + .size()).thenReturn(NUM_OFFSETS_TOPIC_PARTITIONS); + + consumerProperties.put(ConsumerConfig.GROUP_ID_CONFIG, + NewConsumer.configureGroupId(TARGET_CONSUMER_GROUP_ID, adminClient)); + System.out.println("Consumer properties after configuration: " + consumerProperties); + Assert.assertNotNull(consumerProperties.get(ConsumerConfig.GROUP_ID_CONFIG)); + + // Testing I: run partitionsFor() on the result to make sure they are the same + int hashedResult = + ConsumerGroupCoordinatorUtils.partitionFor(consumerProperties.get(ConsumerConfig.GROUP_ID_CONFIG).toString(), + NUM_OFFSETS_TOPIC_PARTITIONS); + int hashedResult2 = + ConsumerGroupCoordinatorUtils.partitionFor(TARGET_CONSUMER_GROUP_ID, NUM_OFFSETS_TOPIC_PARTITIONS); + + Assert.assertEquals(hashedResult, hashedResult2); + System.out.println("Modulo result as an absolute value: " + hashedResult); + 
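+ // For reference, the mapping exercised by this test is, in essence, abs(groupId.hashCode()) modulo the number of
+ // __consumer_offsets partitions; the leader of the resulting partition serves as the group's coordinator.
+ // ConsumerGroupCoordinatorUtils.partitionFor presumably reduces to something like:
+ //   int hash = groupId.hashCode();
+ //   int partition = (hash == Integer.MIN_VALUE ? 0 : Math.abs(hash)) % numOffsetsTopicPartitions;
+ // so a generated group id whose hash lands on the same partition as the target group id shares its coordinator.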
System.out.println("Modulo result as an absolute value: " + hashedResult2); + + // Testing II: Also test that the groupIds are different. + Assert.assertNotEquals(TARGET_CONSUMER_GROUP_ID, consumerProperties.get(ConsumerConfig.GROUP_ID_CONFIG)); + + } +} diff --git a/src/test/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationServiceTest.java b/src/test/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationServiceTest.java new file mode 100644 index 00000000..b3aec5c9 --- /dev/null +++ b/src/test/java/com/linkedin/xinfra/monitor/services/ClusterTopicManipulationServiceTest.java @@ -0,0 +1,120 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.linkedin.xinfra.monitor.XinfraMonitorConstants; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.kafka.common.Node; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.requests.DescribeLogDirsResponse; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +/** + * Testing methods for the Xinfra Monitor class of ClusterTopicManipulationService. 
+ */ +@Test +public class ClusterTopicManipulationServiceTest { + + private static final String SERVICE_TEST_TOPIC = "xinfra-monitor-topic-manipulation-test-topic"; + + @BeforeMethod + private void startTest() { + System.out.println("Started " + this.getClass().getSimpleName().toLowerCase() + "."); + } + + @AfterMethod + private void finishTest() { + System.out.println("Finished " + this.getClass().getCanonicalName().toLowerCase() + "."); + } + + @Test(invocationCount = 2) + void serviceStartTest() throws JsonProcessingException { + ClusterTopicManipulationService clusterTopicManipulationService = + Mockito.mock(ClusterTopicManipulationService.class); + + Mockito.doCallRealMethod() + .when(clusterTopicManipulationService) + .processBroker(Mockito.anyMap(), Mockito.any(), Mockito.anyString()); + + Mockito.doCallRealMethod() + .when(clusterTopicManipulationService) + .setExpectedPartitionsCount(Mockito.anyInt()); + + Mockito.doCallRealMethod() + .when(clusterTopicManipulationService) + .expectedPartitionsCount(); + + List brokers = new ArrayList<>(); + for (int id = 1; id < 3; id++) { + brokers.add(new Node(id, "kafka-broker-host", 8000)); + } + + Map> logDirectoriesResponseMap1 = new HashMap<>(); + Map> logDirectoriesResponseMap2 = new HashMap<>(); + + Map>> brokerMapHashMap = new HashMap<>(); + brokerMapHashMap.putIfAbsent(brokers.get(0), logDirectoriesResponseMap1); + brokerMapHashMap.putIfAbsent(brokers.get(1), logDirectoriesResponseMap2); + + Map logDirInfoMap1 = new HashMap<>(); + Map logDirInfoMap2 = new HashMap<>(); + + logDirectoriesResponseMap1.put(brokers.get(0).id(), logDirInfoMap1); + logDirectoriesResponseMap2.put(brokers.get(1).id(), logDirInfoMap2); + + Map replicaInfos1 = new HashMap<>(); + Map replicaInfos2 = new HashMap<>(); + + for (int topicPartition = 0; topicPartition < 3; topicPartition++) { + replicaInfos1.put(new TopicPartition(SERVICE_TEST_TOPIC, topicPartition), + new DescribeLogDirsResponse.ReplicaInfo(235, 0, false)); + + replicaInfos2.put(new TopicPartition(SERVICE_TEST_TOPIC, topicPartition), + new DescribeLogDirsResponse.ReplicaInfo(235, 0, false)); + } + + int totalPartitions = brokers.size() * replicaInfos1.size(); + System.out.println(totalPartitions); + clusterTopicManipulationService.setExpectedPartitionsCount(totalPartitions); + System.out.println(clusterTopicManipulationService.expectedPartitionsCount()); + + logDirInfoMap1.put(XinfraMonitorConstants.KAFKA_LOG_DIRECTORY + "-1", + new DescribeLogDirsResponse.LogDirInfo(null, replicaInfos1)); + logDirInfoMap2.put(XinfraMonitorConstants.KAFKA_LOG_DIRECTORY + "-2", + new DescribeLogDirsResponse.LogDirInfo(null, replicaInfos2)); + + ObjectMapper objectMapper = new ObjectMapper(); + ObjectWriter objectWriter = objectMapper.writerWithDefaultPrettyPrinter(); + + for (Map.Entry>> nodeMapEntry : brokerMapHashMap.entrySet()) { + System.out.println(objectWriter.writeValueAsString(nodeMapEntry.getValue())); + } + + for (Node broker : brokers) { + clusterTopicManipulationService.processBroker(brokerMapHashMap.get(broker), broker, SERVICE_TEST_TOPIC); + } + + Assert.assertEquals(totalPartitions, clusterTopicManipulationService.expectedPartitionsCount()); + System.out.println(); + } +} + diff --git a/src/test/java/com/linkedin/xinfra/monitor/services/ConsumeServiceTest.java b/src/test/java/com/linkedin/xinfra/monitor/services/ConsumeServiceTest.java new file mode 100644 index 00000000..5cb9282c --- /dev/null +++ b/src/test/java/com/linkedin/xinfra/monitor/services/ConsumeServiceTest.java @@ -0,0 +1,242 @@ +/** + * 
Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.common.Utils; +import com.linkedin.xinfra.monitor.consumer.BaseConsumerRecord; +import com.linkedin.xinfra.monitor.consumer.KMBaseConsumer; +import com.linkedin.xinfra.monitor.services.metrics.CommitLatencyMetrics; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetCommitCallback; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.metrics.Metrics; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +/** + * This public class is a Unit Testing class for the Consume Service Class. + * Also tests for Kafka Monitor Consumer offset commits. + */ +public class ConsumeServiceTest { + private static final String TOPIC = "kafka-monitor-topic-testing"; + private static final Logger LOG = LoggerFactory.getLogger(ConsumeServiceTest.class); + private static final String TAGS_NAME = "name"; + private static final String METRIC_GROUP_NAME = "commit-availability-service"; + /* thread start delay in seconds */ + private static final long THREAD_START_DELAY_SECONDS = 4; + private static final String TAG_NAME_VALUE = "name"; + private static final long MOCK_LAST_COMMITTED_OFFSET = System.currentTimeMillis(); + private static final int PARTITION = 2; + private static final long FIRST_OFFSET = 2; + private static final long SECOND_OFFSET = 3; + private static Map tags; + + @Test + public void lifecycleTest() throws Exception { + ConsumeService consumeService = consumeService(); + + /* Nothing should be started */ + Assert.assertFalse(consumeService.isRunning()); + Assert.assertNotNull(consumeService.getServiceName()); + + /* Should accept but ignore start because start has not been called */ + consumeService.stop(); + Assert.assertFalse(consumeService.isRunning()); + + /* Should start */ + consumeService.startConsumeThreadForTesting(); + Assert.assertTrue(consumeService.isRunning()); + + shutdownConsumeService(consumeService); + } + + @Test + public void commitAvailabilityTest() throws Exception { + ConsumeService consumeService = consumeService(); + Metrics metrics = consumeServiceMetrics(consumeService); + + Assert.assertNotNull(metrics.metrics().get(metrics.metricName("offsets-committed-total", METRIC_GROUP_NAME, tags)).metricValue()); + Assert.assertEquals(metrics.metrics().get(metrics.metricName("offsets-committed-total", METRIC_GROUP_NAME, tags)).metricValue(), 0.0); + + /* Should start */ + consumeService.startConsumeThreadForTesting(); + Assert.assertTrue(consumeService.isRunning()); + + /* in 
milliseconds */ + long threadStartDelay = TimeUnit.SECONDS.toMillis(THREAD_START_DELAY_SECONDS); + + /* Thread.sleep safe to do here instead of ScheduledExecutorService + * We want to sleep current thread so that consumeService can start running for enough seconds. */ + Thread.sleep(threadStartDelay); + Assert.assertNotNull(metrics.metrics().get(metrics.metricName("offsets-committed-total", METRIC_GROUP_NAME, tags)).metricValue()); + Assert.assertNotNull(metrics.metrics().get(metrics.metricName("failed-commit-offsets-total", METRIC_GROUP_NAME, + tags)).metricValue()); + Assert.assertEquals(metrics.metrics().get(metrics.metricName("failed-commit-offsets-total", METRIC_GROUP_NAME, tags)).metricValue(), 0.0); + Assert.assertNotEquals(metrics.metrics().get(metrics.metricName("offsets-committed-total", METRIC_GROUP_NAME, tags)).metricValue(), 0.0); + shutdownConsumeService(consumeService); + } + + @Test + public void commitLatencyTest() throws Exception { + CommitLatencyMetrics commitLatencyMetrics = Mockito.mock(CommitLatencyMetrics.class); + Assert.assertNotNull(commitLatencyMetrics); + + ConsumeService consumeService = consumeService(); + Metrics metrics = consumeServiceMetrics(consumeService); + + Assert.assertNull(metrics.metrics().get(metrics.metricName("commit-offset-latency-ms-avg", METRIC_GROUP_NAME, tags))); + Assert.assertNull(metrics.metrics().get(metrics.metricName("commit-offset-latency-ms-max", METRIC_GROUP_NAME, tags))); + + /* Should start */ + consumeService.startConsumeThreadForTesting(); + Assert.assertTrue(consumeService.isRunning()); + + /* in milliseconds */ + long threadStartDelay = TimeUnit.SECONDS.toMillis(THREAD_START_DELAY_SECONDS); + + /* Thread.sleep safe to do here instead of ScheduledExecutorService + * We want to sleep current thread so that consumeService can start running for enough seconds. */ + Thread.sleep(threadStartDelay); + + shutdownConsumeService(consumeService); + } + + /** + * Sample ConsumeService instance for unit testing + * @return Sample ConsumeService object. 
+ * @throws Exception should the ConsumeService creation fail or throws an error / exception + */ + private ConsumeService consumeService() throws Exception { + LOG.info("Creating an instance of Consume Service for testing.."); + + ConsumerFactory consumerFactory = Mockito.mock(ConsumerFactory.class); + AdminClient adminClient = Mockito.mock(AdminClient.class); + KMBaseConsumer kmBaseConsumer = Mockito.mock(KMBaseConsumer.class); + + Mockito.when(consumerFactory.adminClient()).thenReturn(adminClient); + Mockito.when(consumerFactory.latencySlaMs()).thenReturn(20000); + Mockito.when(consumerFactory.baseConsumer()).thenReturn(kmBaseConsumer); + Mockito.when(consumerFactory.topic()).thenReturn(TOPIC); + + /* LATENCY_PERCENTILE_MAX_MS_CONFIG, */ + Mockito.when(consumerFactory.latencyPercentileMaxMs()).thenReturn(5000); + + /* LATENCY_PERCENTILE_GRANULARITY_MS_CONFIG */ + Mockito.when(consumerFactory.latencyPercentileGranularityMs()).thenReturn(1); + + /* define return value */ + Mockito.when(kmBaseConsumer.lastCommitted()).thenReturn(MOCK_LAST_COMMITTED_OFFSET); + Mockito.when(kmBaseConsumer.committed(Mockito.any())).thenReturn(new OffsetAndMetadata(FIRST_OFFSET)); + Mockito.doAnswer(new Answer() { + @Override + public Void answer(InvocationOnMock invocationOnMock) { + OffsetCommitCallback callback = invocationOnMock.getArgument(0); + Map committedOffsets = new HashMap<>(); + committedOffsets.put(new TopicPartition(TOPIC, PARTITION), new OffsetAndMetadata(FIRST_OFFSET)); + callback.onComplete(committedOffsets, null); + + return null; + } + }).when(kmBaseConsumer).commitAsync(Mockito.any(OffsetCommitCallback.class)); + + + /* avro record to KmBaseConsumer record */ + Mockito.when(kmBaseConsumer.receive()).thenReturn( + new BaseConsumerRecord(TOPIC, PARTITION, SECOND_OFFSET, "key", + Utils.jsonFromFields(TOPIC, 2, 6000, "producerId", 2))); + + CompletableFuture topicPartitionResult = new CompletableFuture<>(); + topicPartitionResult.complete(null); + + return new ConsumeService(TAG_NAME_VALUE, topicPartitionResult, consumerFactory); + } + + @Test + public void awaitShutdownOtherThread() throws Exception { + final ConsumeService consumeService = consumeService(); + final AtomicReference error = new AtomicReference<>(); + + Thread thread = new Thread("test awaitshutdown thread") { + @Override + public void run() { + try { + consumeService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + } catch (Throwable t) { + error.set(t); + } + } + }; + + thread.start(); + consumeService.startConsumeThreadForTesting(); + Thread.sleep(100); + + consumeService.stop(); + thread.join(5000); + + Assert.assertFalse(thread.isAlive()); + Assert.assertEquals(error.get(), null); + + } + + /** + * return consume service metrics. + * @param consumeService ConsumeService object + * @return consume service metrics + */ + private Metrics consumeServiceMetrics(ConsumeService consumeService) { + setup(); + Metrics metrics = consumeService.metrics(); + return metrics; + } + + /** + * set up the tags for the metrics + */ + @BeforeMethod + public void setup() { + tags = new HashMap<>(); + tags.put(TAGS_NAME, TAG_NAME_VALUE); + } + + /** + * shutdown the consume service. + * @param consumeService object of ConsumeService + */ + private void shutdownConsumeService(ConsumeService consumeService) { + /* + intentionally attempt stopping twice as such executions shouldn't throw any exceptions. 
+ Should allow start to be called more than once + */ + consumeService.stop(); + consumeService.stop(); + Assert.assertFalse(consumeService.isRunning()); + + /* Should be allowed to shutdown more than once. */ + consumeService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + consumeService.awaitShutdown(Integer.MAX_VALUE, TimeUnit.MILLISECONDS); + Assert.assertFalse(consumeService.isRunning()); + } + +} diff --git a/src/test/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementServiceTest.java b/src/test/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementServiceTest.java new file mode 100644 index 00000000..3a7deccc --- /dev/null +++ b/src/test/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementServiceTest.java @@ -0,0 +1,167 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +package com.linkedin.xinfra.monitor.services; + +import com.linkedin.xinfra.monitor.topicfactory.TopicFactory; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import kafka.admin.BrokerMetadata; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.CreateTopicsResult; +import org.apache.kafka.clients.admin.DescribeClusterResult; +import org.apache.kafka.clients.admin.DescribeTopicsResult; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.common.KafkaFuture; +import org.apache.kafka.common.Node; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; +import scala.Option; + + +/** + * Testing methods for the Xinfra Monitor class of MultiClusterTopicManagementService. 
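+ * For maybeAddPartitionsTest below: newPartitionAssignments is asked to grow the topic from partitionNum = 5 to
+ * minPartitionNum = 14 partitions with replication factor 4, so it is expected to return 14 - 5 = 9 new
+ * assignments, each naming 4 brokers drawn from the 10-broker metadata set.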
+ */ +@SuppressWarnings("unchecked") +@Test +public class MultiClusterTopicManagementServiceTest { + + private static final String SERVICE_TEST_TOPIC = "xinfra-monitor-Multi-Cluster-Topic-Management-Service-Test-topic"; + private static Set nodeSet; + private MultiClusterTopicManagementService.TopicManagementHelper _topicManagementHelper; + private CreateTopicsResult _createTopicsResult; + private Map> _kafkaFutureMap; + private KafkaFuture _kafkaFuture; + + @BeforeMethod + private void startTest() { + _createTopicsResult = Mockito.mock(CreateTopicsResult.class); + _kafkaFutureMap = Mockito.mock(Map.class); + _kafkaFuture = Mockito.mock(KafkaFuture.class); + + nodeSet = new LinkedHashSet<>(); + nodeSet.add(new Node(1, "host-1", 2132)); + nodeSet.add(new Node(2, "host-2", 2133)); + nodeSet.add(new Node(3, "host-3", 2134)); + nodeSet.add(new Node(4, "host-4", 2135)); + nodeSet.add(new Node(5, "host-5", 2136)); + nodeSet.add(new Node(6, "host-5", 2137)); + nodeSet.add(new Node(7, "host-5", 2138)); + nodeSet.add(new Node(8, "host-5", 2139)); + nodeSet.add(new Node(9, "host-5", 2140)); + nodeSet.add(new Node(10, "host-5", 2141)); + + _topicManagementHelper = Mockito.mock(MultiClusterTopicManagementService.TopicManagementHelper.class); + _topicManagementHelper._topic = SERVICE_TEST_TOPIC; + _topicManagementHelper._adminClient = Mockito.mock(AdminClient.class); + _topicManagementHelper._topicFactory = Mockito.mock(TopicFactory.class); + _topicManagementHelper._topicCreationEnabled = true; + _topicManagementHelper._topicAddPartitionEnabled = true; + _topicManagementHelper._topicReassignPartitionAndElectLeaderEnabled = true; + } + + @AfterMethod + private void finishTest() { + System.out.println("Finished " + this.getClass().getCanonicalName().toLowerCase() + "."); + } + + @Test(invocationCount = 2) + protected void maybeAddPartitionsTest() { + Set brokerMetadataSet = new LinkedHashSet<>(); + for (Node broker : nodeSet) { + brokerMetadataSet.add(new BrokerMetadata(broker.id(), Option.apply(broker.rack()))); + } + + int minPartitionNum = 14; + int partitionNum = 5; + int rf = 4; + + List> newPartitionAssignments = + MultiClusterTopicManagementService.TopicManagementHelper.newPartitionAssignments(minPartitionNum, partitionNum, brokerMetadataSet, rf); + Assert.assertNotNull(newPartitionAssignments); + + System.out.println(newPartitionAssignments); + Assert.assertEquals(newPartitionAssignments.size(), minPartitionNum - partitionNum); + Assert.assertEquals(newPartitionAssignments.get(0).size(), rf); + } + + @Test + protected void MultiClusterTopicManagementServiceTopicCreationTest() throws Exception { + + Mockito.doCallRealMethod().when(_topicManagementHelper).maybeCreateTopic(); + + Mockito.when(_topicManagementHelper._adminClient.describeCluster()) + .thenReturn(Mockito.mock(DescribeClusterResult.class)); + Mockito.when(_topicManagementHelper._adminClient.describeCluster().nodes()) + .thenReturn(Mockito.mock(KafkaFuture.class)); + Mockito.when(_topicManagementHelper._adminClient.describeCluster().nodes().get()).thenReturn(nodeSet); + + Mockito.when(_topicManagementHelper._adminClient.createTopics(Mockito.anyCollection())) + .thenReturn(_createTopicsResult); + Mockito.when(_topicManagementHelper._adminClient.createTopics(Mockito.anyCollection()).values()) + .thenReturn(_kafkaFutureMap); + Mockito.when( + _topicManagementHelper._adminClient.createTopics(Mockito.anyCollection()).values().get(SERVICE_TEST_TOPIC)) + .thenReturn(_kafkaFuture); + + Answer createKafkaTopicFutureAnswer = new Answer() { + /** + 
* @param invocation the invocation on the mocked TopicManagementHelper. + * @return NULL value. + * @throws Throwable the throwable to be thrown when Exception occurs. + */ + @Override + public Void answer(InvocationOnMock invocation) throws Throwable { + + Mockito.when(_topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC))) + .thenReturn(Mockito.mock(DescribeTopicsResult.class)); + Mockito.when( + _topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC)).values()) + .thenReturn(Mockito.mock(Map.class)); + Mockito.when(_topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC)) + .values() + .get(SERVICE_TEST_TOPIC)).thenReturn(Mockito.mock(KafkaFuture.class)); + Mockito.when(_topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC)) + .values() + .get(SERVICE_TEST_TOPIC) + .get()).thenReturn(Mockito.mock(TopicDescription.class)); + Mockito.when(_topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC)) + .values() + .get(SERVICE_TEST_TOPIC) + .get() + .name()).thenReturn(SERVICE_TEST_TOPIC); + return null; + } + }; + + Mockito.when(_topicManagementHelper._topicFactory.createTopicIfNotExist(Mockito.anyString(), Mockito.anyShort(), + Mockito.anyDouble(), Mockito.any(), Mockito.any())).thenAnswer(createKafkaTopicFutureAnswer); + + _topicManagementHelper.maybeCreateTopic(); + + Assert.assertNotNull(_topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC)) + .values() + .get(SERVICE_TEST_TOPIC) + .get()); + Assert.assertEquals(_topicManagementHelper._adminClient.describeTopics(Collections.singleton(SERVICE_TEST_TOPIC)) + .values() + .get(SERVICE_TEST_TOPIC) + .get() + .name(), SERVICE_TEST_TOPIC); + } +} diff --git a/src/test/java/com/linkedin/xinfra/monitor/services/OffsetCommitServiceTest.java b/src/test/java/com/linkedin/xinfra/monitor/services/OffsetCommitServiceTest.java new file mode 100644 index 00000000..8b543774 --- /dev/null +++ b/src/test/java/com/linkedin/xinfra/monitor/services/OffsetCommitServiceTest.java @@ -0,0 +1,38 @@ +/** + * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+  }
+}
+
diff --git a/src/test/java/com/linkedin/xinfra/monitor/services/TopicManagementServiceTest.java b/src/test/java/com/linkedin/xinfra/monitor/services/TopicManagementServiceTest.java
new file mode 100644
index 00000000..ca8d77b1
--- /dev/null
+++ b/src/test/java/com/linkedin/xinfra/monitor/services/TopicManagementServiceTest.java
@@ -0,0 +1,96 @@
+/**
+ * Copyright 2020 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+ * file except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+package com.linkedin.xinfra.monitor.services;
+
+import com.linkedin.xinfra.monitor.services.MultiClusterTopicManagementService.TopicManagementHelper;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.kafka.common.Node;
+import org.apache.kafka.common.TopicPartitionInfo;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+@Test
+public class TopicManagementServiceTest {
+
+  private static final String TOPIC = "kmf-unit-test-topic";
+
+  private List<Node> brokers(int brokerCount) {
+    List<Node> brokers = new ArrayList<>();
+    for (int i = 0; i < brokerCount; i++) {
+      brokers.add(new Node(i, "", -1));
+    }
+    return brokers;
+  }
+
+  private Node[] nodes(int brokerCount) {
+    Node[] nodes = new Node[brokerCount];
+    for (int i = 0; i < brokerCount; i++) {
+      nodes[i] = new Node(i, "", -1);
+    }
+    return nodes;
+  }
+
+  @Test
+  public void noDetection() {
+    List<TopicPartitionInfo> partitions = new ArrayList<>();
+    Node[] node = nodes(2);
+    partitions.add(new TopicPartitionInfo(0, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(1, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(2, node[1], new ArrayList<>(Arrays.asList(node[1], node[0])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(3, node[1], new ArrayList<>(Arrays.asList(node[1], node[0])), new ArrayList<>()));
+
+    Assert.assertFalse(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(2)));
+    Assert.assertFalse(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(2)));
+  }
+
+  @Test
+  public void detectLowTotalNumberOfPartitions() {
+    List<TopicPartitionInfo> partitions = new ArrayList<>();
+    Node[] node = nodes(3);
+    partitions.add(new TopicPartitionInfo(0, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(1, node[1], new ArrayList<>(Arrays.asList(node[1], node[0])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(2, node[2], new ArrayList<>(Arrays.asList(node[2], node[0])), new ArrayList<>()));
+    Assert.assertFalse(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(3)));
+    Assert.assertFalse(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(3)));
+    Assert.assertEquals(TopicManagementHelper.getReplicationFactor(partitions), 2);
+  }
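+
+  // Reading these fixtures: TopicPartitionInfo(partition, leader, replicas, isr),
+  // and a partition's preferred leader is the first entry in its replica list.
+  // In the scenario below, node[2] appears in replica lists but is never the
+  // elected leader of any partition, which someBrokerNotElectedLeader() is
+  // expected to flag.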
+
+  @Test
+  public void detectBrokerWithoutLeader() {
+    List<TopicPartitionInfo> partitions = new ArrayList<>();
+    Node[] node = nodes(3);
+    partitions.add(new TopicPartitionInfo(0, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(1, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(2, node[1], new ArrayList<>(Arrays.asList(node[1], node[0])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(3, node[1], new ArrayList<>(Arrays.asList(node[2], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(4, node[1], new ArrayList<>(Arrays.asList(node[2], node[0])), new ArrayList<>()));
+
+    Assert.assertFalse(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(3)));
+    Assert.assertTrue(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(3)));
+  }
+
+  @Test
+  public void detectBrokerWithoutPreferredLeader() {
+    List<TopicPartitionInfo> partitions = new ArrayList<>();
+    Node[] node = nodes(3);
+    partitions.add(new TopicPartitionInfo(0, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(1, node[0], new ArrayList<>(Arrays.asList(node[0], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(2, node[1], new ArrayList<>(Arrays.asList(node[0], node[0])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(3, node[1], new ArrayList<>(Arrays.asList(node[2], node[1])), new ArrayList<>()));
+    partitions.add(new TopicPartitionInfo(4, node[1], new ArrayList<>(Arrays.asList(node[2], node[0])), new ArrayList<>()));
+
+    Assert.assertTrue(TopicManagementHelper.someBrokerNotPreferredLeader(partitions, brokers(3)));
+    Assert.assertTrue(TopicManagementHelper.someBrokerNotElectedLeader(partitions, brokers(3)));
+  }
+}
diff --git a/webapp/index.html b/webapp/index.html
index c27cfaa2..03c22fb8 100644
--- a/webapp/index.html
+++ b/webapp/index.html
@@ -1,5 +1,5 @@