
Commit 2433308

add files

1 parent 574d388 commit 2433308

221 files changed, +9930 -0 lines changed

(Large commits have some content hidden by default, so some file paths below are not shown.)

chapter1/build.xml

Lines changed: 42 additions & 0 deletions
<project name="HadoopCookbook" default="build" basedir=".">
  <description>
    simple example build file
  </description>

  <!-- set global properties for this build -->
  <property name="src" location="src" />
  <property name="build" location="build" />
  <property name="hadoop.home" location="/Users/srinath/playground/hadoop-book/hadoop-1.0.0" />

  <target name="init">
    <!-- Create the time stamp -->
    <tstamp />
    <!-- Create the build directory structure used by compile -->
    <mkdir dir="${build}" />
    <mkdir dir="${build}/classes" />
  </target>

  <target name="compile" depends="init" description="compile the source">
    <!-- Compile the Java code from ${src} into ${build}/classes -->
    <javac srcdir="${src}" destdir="${build}/classes" includeantruntime="false">
      <classpath>
        <fileset dir="${hadoop.home}/lib">
          <include name="**/*.jar" />
        </fileset>
        <fileset dir="${hadoop.home}">
          <include name="hadoop-core-*.jar" />
        </fileset>
      </classpath>
    </javac>
  </target>

  <target name="build" depends="compile" description="generate the distribution">
    <!-- Build the jar file -->
    <jar jarfile="${build}/lib/hadoop-cookbook-chapter1.jar" basedir="${build}/classes" />
  </target>

  <target name="clean" description="clean up">
    <!-- Delete the ${build} directory tree -->
    <delete dir="${build}" />
  </target>
</project>
Four binary files (1.6 KB, 1.59 KB, 1.63 KB, 3.05 KB) are not shown; their file paths are hidden.
chapter1/src/chapter1/WordCount.java

Lines changed: 107 additions & 0 deletions
/**
 * The following sample is adapted from the original wordcount sample from
 * http://wiki.apache.org/hadoop/WordCount.
 */
package chapter1;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * <p>The word count sample counts the number of word occurrences within a set of input documents
 * using MapReduce. The code has three parts: mapper, reducer, and the main program.</p>
 * @author Srinath Perera ([email protected])
 */
public class WordCount {

  /**
   * <p>The mapper extends the org.apache.hadoop.mapreduce.Mapper class. When Hadoop runs the job,
   * it passes each line of the input files to the mapper as input. The "map" function
   * tokenizes the line and, for each token (word), emits (word, 1) as the output.</p>
   */
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  /**
   * <p>The reduce function receives all the values that have the same key as its input, and it
   * outputs the key and the number of occurrences of the key as its output.</p>
   */
  public static class IntSumReducer
       extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  /**
   * <p>This program takes any text file as input. Create a folder called "input" in HDFS (or in a
   * local directory if you are running this locally) and copy the input files there.</p>
   * <ol>
   * <li>Option 1: You can compile the sample with Ant from the sample directory. To do this, you
   * need Apache Ant installed on your system; otherwise, you can use the compiled jar included
   * with the source code. Change directory to HADOOP_HOME and copy hadoop-cookbook.jar to
   * HADOOP_HOME. Then run the command: bin/hadoop jar hadoop-cookbook.jar chapter1.WordCount
   * input output.</li>
   * <li>As an optional step, copy the "input" directory to the top level of the IDE-based project
   * (Eclipse project) that you created for the samples. Now you can run the WordCount class
   * directly from your IDE, passing "input output" as arguments. This runs the sample the same
   * way as before. Running MapReduce jobs from the IDE in this manner is very useful for
   * debugging your MapReduce jobs.</li>
   * </ol>
   * @param args input and output directories
   * @throws Exception if the job fails
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // Uncomment this to use the reducer as a combiner, pre-aggregating counts on the map side:
    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
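Because TokenizerMapper and IntSumReducer are plain classes, they can be exercised without a cluster, which pairs well with the IDE-based debugging described above. The following is a minimal sketch of such a unit test, assuming the Apache MRUnit and JUnit libraries are on the classpath (neither ships with this commit); the test class name and input line are illustrative only.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

import chapter1.WordCount.TokenizerMapper;

public class WordCountMapperTest {

  @Test
  public void mapperEmitsOnePerToken() throws Exception {
    // Drive the mapper with one input line and assert the (word, 1) pairs it emits, in order.
    MapDriver<Object, Text, Text, IntWritable> driver =
        MapDriver.newMapDriver(new TokenizerMapper());
    driver.withInput(new LongWritable(0), new Text("to be or not to be"))
          .withOutput(new Text("to"), new IntWritable(1))
          .withOutput(new Text("be"), new IntWritable(1))
          .withOutput(new Text("or"), new IntWritable(1))
          .withOutput(new Text("not"), new IntWritable(1))
          .withOutput(new Text("to"), new IntWritable(1))
          .withOutput(new Text("be"), new IntWritable(1))
          .runTest();
  }
}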

chapter10/C10Samples.jar

3.13 KB binary file, not shown.

chapter10/build.xml

Lines changed: 25 additions & 0 deletions
<project name="C10Samples" default="compile" basedir=".">
  <property name="build" location="build" />
  <property environment="env" />

  <path id="hadoop-classpath">
    <fileset dir="${env.HADOOP_HOME}/lib">
      <include name="**/*.jar" />
    </fileset>
    <fileset dir="${env.HADOOP_HOME}">
      <include name="hadoop-*.jar" />
    </fileset>
  </path>

  <target name="compile">
    <mkdir dir="${build}" />
    <javac includeantruntime="false" srcdir="src" destdir="${build}">
      <classpath refid="hadoop-classpath" />
    </javac>
    <jar jarfile="C10Samples.jar" basedir="${build}" />
  </target>

  <target name="clean">
    <delete dir="${build}" />
  </target>
</project>
chapter10 HiveQL script (file path hidden)

Lines changed: 25 additions & 0 deletions
CREATE TABLE HDI(
  id INT, country STRING, hdi FLOAT, lifeex INT, mysch INT, eysch INT, gni INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

LOAD DATA INPATH '${INPUT}' INTO TABLE HDI;

CREATE EXTERNAL TABLE output_countries(
  country STRING, gni INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '${OUTPUT}/countries';

INSERT OVERWRITE TABLE output_countries
SELECT
  country, gni
FROM
  HDI
WHERE
  gni > 2000;
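The ${INPUT} and ${OUTPUT} placeholders above are Hive variables that the caller substitutes when running the script (for example, via the Hive CLI's variable-definition options). The final query can also be issued programmatically over JDBC; the following is a minimal sketch, assuming a HiveServer1-era server is listening on localhost:10000 and the Hive JDBC driver and its dependencies are on the classpath. The host, port, and class name are illustrative, not part of this commit.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveQueryExample {
  public static void main(String[] args) throws Exception {
    // Register the HiveServer1-era JDBC driver (assumed available on the classpath).
    Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
    Connection con = DriverManager.getConnection("jdbc:hive://localhost:10000/default", "", "");
    Statement stmt = con.createStatement();
    // Same filter as the script: countries with GNI above 2000.
    ResultSet rs = stmt.executeQuery("SELECT country, gni FROM HDI WHERE gni > 2000");
    while (rs.next()) {
      System.out.println(rs.getString(1) + "\t" + rs.getInt(2));
    }
    rs.close();
    stmt.close();
    con.close();
  }
}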
chapter10 Pig Latin script (file path hidden)

Lines changed: 4 additions & 0 deletions
A = LOAD '$INPUT' USING PigStorage(',') AS (id:int, country:chararray, hdi:float, lifeex:int, mysch:int, eysch:int, gni:int);
B = FILTER A BY gni > 2000;
C = ORDER B BY gni;
STORE C INTO '$OUTPUT';
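The $INPUT and $OUTPUT placeholders are Pig parameters, normally supplied when the script is launched (for example, with the pig command's -p options). The script can also be driven from Java through Pig's embedded PigServer API; the sketch below is a minimal, hypothetical example. The script file name and parameter values are placeholders, and Pig's jars are assumed to be on the classpath.

import java.util.HashMap;
import java.util.Map;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

public class PigScriptRunner {
  public static void main(String[] args) throws Exception {
    // Run Pig in local mode; use ExecType.MAPREDUCE to run against a cluster instead.
    PigServer pig = new PigServer(ExecType.LOCAL);

    // Supply values for the $INPUT and $OUTPUT placeholders in the script.
    Map<String, String> params = new HashMap<String, String>();
    params.put("INPUT", "hdi-data.csv");    // hypothetical input file
    params.put("OUTPUT", "countries-out");  // hypothetical output directory

    // registerScript parses and runs the script's statements; STORE triggers execution.
    pig.registerScript("countryFilter.pig", params); // hypothetical script name
    pig.shutdown();
  }
}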
