WIP on kafka cluster

apache · koeninger · Nov 23, 2014 · Nov 23, 2014 · Nov 24, 2014 · Nov 24, 2014
commit 1d706257ac848d37caeaff0409bf60b080d66e48
diff --git a/external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaCluster.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.rdd.kafka
+
+import scala.util.control.NonFatal
+import java.util.Properties
+import kafka.api.{TopicMetadataRequest, TopicMetadataResponse}
+import kafka.consumer.{ConsumerConfig, SimpleConsumer}
+
+/**
+  * Convenience methods for interacting with a Kafka cluster.
+  * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">configuration parameters</a>.
+  *   Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+  *   NOT zookeeper servers, specified in host1:port1,host2:port2 form
+  */
+class KafkaCluster(val kafkaParams: Map[String, String]) {
+  /** Seed brokers as (host, port) pairs, parsed from the comma-separated host:port list. */
+  val brokers: Array[(String, Int)] =
+    kafkaParams.get("metadata.broker.list")
+      .orElse(kafkaParams.get("bootstrap.servers"))
+      .getOrElse(throw new Exception("Must specify metadata.broker.list or bootstrap.servers"))
+      .split(",").map { hp =>
+        val hpa = hp.split(":")
+        (hpa(0), hpa(1).toInt)
+      }
+
+  /** Consumer settings built from kafkaParams by KafkaCluster.consumerConfig. */
+  val config: ConsumerConfig = KafkaCluster.consumerConfig(kafkaParams)
+
+  /** Opens a new SimpleConsumer to a broker, using timeout, buffer size and client id from config. */
+  def connect(host: String, port: Int): SimpleConsumer =
+    new SimpleConsumer(host, port, config.socketTimeoutMs, config.socketReceiveBufferBytes, config.clientId)
+
+  def connect(hostAndPort: (String, Int)): SimpleConsumer =
+    connect(hostAndPort._1, hostAndPort._2)
+
+  /** Connects to the leader reported by findLeader; None when no leader could be found. */
+  def connectLeader(topic: String, partition: Int): Option[SimpleConsumer] =
+    findLeader(topic, partition).map(connect)
+
+  /** Asks each seed broker in order for topic metadata; first leader (host, port) found wins. */
+  def findLeader(topic: String, partition: Int): Option[(String, Int)] =
+    brokers.iterator.map { hp =>
+      var consumer: SimpleConsumer = null
+      try {
+        consumer = connect(hp)
+        val req = TopicMetadataRequest(TopicMetadataRequest.CurrentVersion, 0, config.clientId, Seq(topic))
+        val resp: TopicMetadataResponse = consumer.send(req)
+        resp.topicsMetadata.find(_.topic == topic)
+          .flatMap(t => t.partitionsMetadata.find(_.partitionId == partition))
+          .flatMap(_.leader)
+          .map(leader => (leader.host, leader.port))
+      } catch {
+        case NonFatal(e) => None // deliberate best effort: skip this broker, try the next one
+      } finally {
+        if (consumer != null) consumer.close()
+      }
+    }.collectFirst { case Some(leader) => leader } // iterator is lazy: stops at the first hit
+}
+
+object KafkaCluster {
+  /** Make a consumer config without requiring group.id or zookeeper.connect (each defaults to ""
+    * when absent), since communicating with brokers also needs common settings such as timeout
+    */
+  def consumerConfig(kafkaParams: Map[String, String]): ConsumerConfig = {
+    val props = new Properties()
+    kafkaParams.foreach { case (key, value) => props.put(key, value) }
+    Seq("zookeeper.connect", "group.id").foreach { s =>
+      // containsKey, not contains: Hashtable.contains tests values and would clobber user settings
+      if (!props.containsKey(s)) props.setProperty(s, "")
+    }
+    new ConsumerConfig(props)
+  }
+}
diff --git a/external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDD.scala
@@ -39,7 +39,7 @@ private[spark] case class KafkaRDDPartition(
   untilOffset: Long
 ) extends Partition
 
-/** A batch-oriented interface to Kafka.
+/** A batch-oriented interface for consuming from Kafka.
   * Each given Kafka topic/partition corresponds to an RDD partition.
   * Starting and ending offsets are specified in advance, so that you can control exactly-once semantics.
   * For an easy interface to Kafka-managed offsets, see {@link org.apache.spark.rdd.kafka.KafkaCluster}
@@ -74,34 +74,31 @@ class KafkaRDD[
   override def compute(thePart: Partition, context: TaskContext) = new NextIterator[R] {
     context.addTaskCompletionListener{ context => closeIfNeeded() }
 
+    val kc = new KafkaCluster(kafkaParams)
     val part = thePart.asInstanceOf[KafkaRDDPartition]
-    val props = new Properties()
-    kafkaParams.foreach(param => props.put(param._1, param._2))
-    val fetchSize = Option(props.getProperty("fetch.message.max.bytes")).map(_.toInt).getOrElse(1024*1024)
-    val leaderBackoff = Option(props.getProperty("refresh.leader.backoff.ms")).map(_.toLong).getOrElse(200L)
-    val consumerConfig = new ConsumerConfig(props)
     val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
-      .newInstance(consumerConfig.props)
+      .newInstance(kc.config.props)
       .asInstanceOf[Decoder[K]]
     val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
-      .newInstance(consumerConfig.props)
+      .newInstance(kc.config.props)
       .asInstanceOf[Decoder[V]]
-    val consumer: SimpleConsumer = ???
+    val consumer: SimpleConsumer = kc.connectLeader(part.topic, part.partition)
+      .getOrElse(throw new Exception(s"Couldn't connect to leader for topic ${part.topic} ${part.partition}"))
     var requestOffset = part.fromOffset
     var iter: Iterator[MessageAndOffset] = null
 
     override def getNext: R = {
       if (iter == null || !iter.hasNext) {
         val req = new FetchRequestBuilder().
-          addFetch(part.topic, part.partition, requestOffset, fetchSize).
+          addFetch(part.topic, part.partition, requestOffset, kc.config.fetchMessageMaxBytes).
           build()
         val resp = consumer.fetch(req)
         if (resp.hasError) {
           val err = resp.errorCode(part.topic, part.partition)
           if (err == ErrorMapping.LeaderNotAvailableCode ||
             err == ErrorMapping.NotLeaderForPartitionCode) {
-            log.error(s"Lost leader for topic ${part.topic} partition ${part.partition}, sleeping for ${leaderBackoff}ms")
-            Thread.sleep(leaderBackoff)
+            log.error(s"Lost leader for topic ${part.topic} partition ${part.partition}, sleeping for ${kc.config.refreshLeaderBackoffMs}ms")
+            Thread.sleep(kc.config.refreshLeaderBackoffMs)
           }
           // Let normal rdd retry sort out reconnect attempts
           throw ErrorMapping.exceptionFor(err)