
Commit 2433308

add files

1 parent 574d388 commit 2433308

221 files changed, +9930 -0 lines changed

(Large commits have some content hidden by default, so some file paths below are not shown.)

chapter1/build.xml

Lines changed: 42 additions & 0 deletions
<project name="HadoopCookbook" default="build" basedir=".">
  <description>
    simple example build file
  </description>

  <!-- set global properties for this build -->
  <property name="src" location="src" />
  <property name="build" location="build" />
  <property name="hadoop.home" location="/Users/srinath/playground/hadoop-book/hadoop-1.0.0" />

  <target name="init">
    <!-- Create the time stamp -->
    <tstamp />
    <!-- Create the build directory structure used by compile -->
    <mkdir dir="${build}" />
    <mkdir dir="${build}/classes" />
  </target>

  <target name="compile" depends="init" description="compile the source">
    <!-- Compile the Java code from ${src} into ${build}/classes -->
    <javac srcdir="${src}" destdir="${build}/classes" includeantruntime="false">
      <classpath>
        <fileset dir="${hadoop.home}/lib">
          <include name="**/*.jar" />
        </fileset>
        <fileset dir="${hadoop.home}">
          <include name="hadoop-core-*.jar" />
        </fileset>
      </classpath>
    </javac>
  </target>

  <target name="build" depends="compile" description="generate the distribution">
    <!-- Build the jar file -->
    <jar jarfile="${build}/lib/hadoop-cookbook-chapter1.jar" basedir="${build}/classes" />
  </target>

  <target name="clean" description="clean up">
    <!-- Delete the ${build} directory tree -->
    <delete dir="${build}" />
  </target>
</project>
Four binary files (1.6 KB, 1.59 KB, 1.63 KB, 3.05 KB) are not shown; their file paths are hidden.
chapter1/src/chapter1/WordCount.java

Lines changed: 107 additions & 0 deletions
/**
 * The following sample is adapted from the original wordcount sample from
 * http://wiki.apache.org/hadoop/WordCount.
 */
package chapter1;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * <p>The word count sample counts the number of word occurrences within a set of input documents
 * using MapReduce. The code has three parts: mapper, reducer, and the main program.</p>
 * @author Srinath Perera ([email protected])
 */
public class WordCount {

  /**
   * <p>The mapper extends the org.apache.hadoop.mapreduce.Mapper class. When Hadoop runs the job,
   * it passes each line of the input files to the mapper as input. The "map" function
   * tokenizes the line and, for each token (word), emits (word, 1) as the output.</p>
   */
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  /**
   * <p>The reduce function receives all the values that have the same key as its input, and it
   * outputs the key and the number of occurrences of the key as its output.</p>
   */
  public static class IntSumReducer
       extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  /**
   * <p>This program takes any text file as input. Create a folder called "input" in HDFS (or in a
   * local directory if you are running this locally) and copy the input files there.</p>
   * <ol>
   * <li>Option 1: You can compile the sample with Ant from the sample directory. To do this, you
   * need Apache Ant installed on your system; otherwise, you can use the compiled jar included
   * with the source code. Change directory to HADOOP_HOME and copy hadoop-cookbook.jar to
   * HADOOP_HOME. Then run the command: bin/hadoop jar hadoop-cookbook.jar chapter1.WordCount
   * input output.</li>
   * <li>As an optional step, copy the "input" directory to the top level of the IDE-based project
   * (Eclipse project) that you created for the samples. Now you can run the WordCount class
   * directly from your IDE, passing "input output" as arguments. This runs the sample the same
   * way as before. Running MapReduce jobs from the IDE in this manner is very useful for
   * debugging your MapReduce jobs.</li>
   * </ol>
   * @param args input and output directories
   * @throws Exception if the job fails
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // Uncomment this to use the reducer as a combiner, pre-aggregating counts on the map side:
    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
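Because TokenizerMapper and IntSumReducer are plain classes, they can be exercised without a cluster, which pairs well with the IDE-based debugging described above. The following is a minimal sketch of such a unit test, assuming the Apache MRUnit and JUnit libraries are on the classpath (neither ships with this commit); the test class name and input line are illustrative only.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

import chapter1.WordCount.TokenizerMapper;

public class WordCountMapperTest {

  @Test
  public void mapperEmitsOnePerToken() throws Exception {
    // Drive the mapper with one input line and assert the (word, 1) pairs it emits, in order.
    MapDriver<Object, Text, Text, IntWritable> driver =
        MapDriver.newMapDriver(new TokenizerMapper());
    driver.withInput(new LongWritable(0), new Text("to be or not to be"))
          .withOutput(new Text("to"), new IntWritable(1))
          .withOutput(new Text("be"), new IntWritable(1))
          .withOutput(new Text("or"), new IntWritable(1))
          .withOutput(new Text("not"), new IntWritable(1))
          .withOutput(new Text("to"), new IntWritable(1))
          .withOutput(new Text("be"), new IntWritable(1))
          .runTest();
  }
}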

chapter10/C10Samples.jar

3.13 KB binary file, not shown.

chapter10/build.xml

Lines changed: 25 additions & 0 deletions
<project name="C10Samples" default="compile" basedir=".">
  <property name="build" location="build" />
  <property environment="env" />

  <path id="hadoop-classpath">
    <fileset dir="${env.HADOOP_HOME}/lib">
      <include name="**/*.jar" />
    </fileset>
    <fileset dir="${env.HADOOP_HOME}">
      <include name="hadoop-*.jar" />
    </fileset>
  </path>

  <target name="compile">
    <mkdir dir="${build}" />
    <javac includeantruntime="false" srcdir="src" destdir="${build}">
      <classpath refid="hadoop-classpath" />
    </javac>
    <jar jarfile="C10Samples.jar" basedir="${build}" />
  </target>

  <target name="clean">
    <delete dir="${build}" />
  </target>
</project>
chapter10 HiveQL script (file path hidden)

Lines changed: 25 additions & 0 deletions
CREATE TABLE HDI(
  id INT, country STRING, hdi FLOAT, lifeex INT, mysch INT, eysch INT, gni INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

LOAD DATA INPATH '${INPUT}' INTO TABLE HDI;

CREATE EXTERNAL TABLE output_countries(
  country STRING, gni INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '${OUTPUT}/countries';

INSERT OVERWRITE TABLE output_countries
SELECT
  country, gni
FROM
  HDI
WHERE
  gni > 2000;
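The ${INPUT} and ${OUTPUT} placeholders above are Hive variables that the caller substitutes when running the script (for example, via the Hive CLI's variable-definition options). The final query can also be issued programmatically over JDBC; the following is a minimal sketch, assuming a HiveServer1-era server is listening on localhost:10000 and the Hive JDBC driver and its dependencies are on the classpath. The host, port, and class name are illustrative, not part of this commit.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveQueryExample {
  public static void main(String[] args) throws Exception {
    // Register the HiveServer1-era JDBC driver (assumed available on the classpath).
    Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
    Connection con = DriverManager.getConnection("jdbc:hive://localhost:10000/default", "", "");
    Statement stmt = con.createStatement();
    // Same filter as the script: countries with GNI above 2000.
    ResultSet rs = stmt.executeQuery("SELECT country, gni FROM HDI WHERE gni > 2000");
    while (rs.next()) {
      System.out.println(rs.getString(1) + "\t" + rs.getInt(2));
    }
    rs.close();
    stmt.close();
    con.close();
  }
}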
chapter10 Pig Latin script (file path hidden)

Lines changed: 4 additions & 0 deletions
A = LOAD '$INPUT' USING PigStorage(',') AS (id:int, country:chararray, hdi:float, lifeex:int, mysch:int, eysch:int, gni:int);
B = FILTER A BY gni > 2000;
C = ORDER B BY gni;
STORE C INTO '$OUTPUT';
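The $INPUT and $OUTPUT placeholders are Pig parameters, normally supplied when the script is launched (for example, with the pig command's -p options). The script can also be driven from Java through Pig's embedded PigServer API; the sketch below is a minimal, hypothetical example. The script file name and parameter values are placeholders, and Pig's jars are assumed to be on the classpath.

import java.util.HashMap;
import java.util.Map;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

public class PigScriptRunner {
  public static void main(String[] args) throws Exception {
    // Run Pig in local mode; use ExecType.MAPREDUCE to run against a cluster instead.
    PigServer pig = new PigServer(ExecType.LOCAL);

    // Supply values for the $INPUT and $OUTPUT placeholders in the script.
    Map<String, String> params = new HashMap<String, String>();
    params.put("INPUT", "hdi-data.csv");    // hypothetical input file
    params.put("OUTPUT", "countries-out");  // hypothetical output directory

    // registerScript parses and runs the script's statements; STORE triggers execution.
    pig.registerScript("countryFilter.pig", params); // hypothetical script name
    pig.shutdown();
  }
}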
