Running WordCount on Hadoop: write the program locally, then run it on the Hadoop cluster

Before running the hadoop jar command, the Hadoop cluster must already be started.

1. First, create a Maven project and import the Hadoop dependencies. (The POM below comes from a larger project and also pulls in Storm, Kafka, Redis and other dependencies; for WordCount only the org.apache.hadoop artifacts are required.)

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.luximg</groupId>
	<artifactId>hadoop2</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<properties>
		<hadoopVersion>2.6.5</hadoopVersion>
	</properties>
	<dependencies>

		<!-- https://mvnrepository.com/artifact/junit/junit -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.10</version>
			<scope>test</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>${hadoopVersion}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>${hadoopVersion}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-mapreduce-client-core</artifactId>
			<version>${hadoopVersion}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>${hadoopVersion}</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.28</version>
		</dependency>

		<dependency>
			<groupId>org.vaadin.addons</groupId>
			<artifactId>dcharts-widget</artifactId>
			<version>0.10.0</version>
		</dependency>

		<dependency>
			<groupId>org.apache.storm</groupId>
			<artifactId>storm-core</artifactId>
			<version>0.9.5</version>
			<!--<scope>provided</scope> -->
		</dependency>
		<dependency>
			<groupId>org.apache.storm</groupId>
			<artifactId>storm-kafka</artifactId>
			<version>0.9.5</version>
			<exclusions>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-api</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>org.clojure</groupId>
			<artifactId>clojure</artifactId>
			<version>1.5.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.kafka</groupId>
			<artifactId>kafka_2.8.2</artifactId>
			<version>0.8.1</version>
			<exclusions>
				<exclusion>
					<artifactId>jmxtools</artifactId>
					<groupId>com.sun.jdmk</groupId>
				</exclusion>
				<exclusion>
					<artifactId>jmxri</artifactId>
					<groupId>com.sun.jmx</groupId>
				</exclusion>
				<exclusion>
					<artifactId>jms</artifactId>
					<groupId>javax.jms</groupId>
				</exclusion>
				<exclusion>
					<groupId>org.apache.zookeeper</groupId>
					<artifactId>zookeeper</artifactId>
				</exclusion>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-api</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>com.google.code.gson</groupId>
			<artifactId>gson</artifactId>
			<version>2.4</version>
		</dependency>
		<dependency>
			<groupId>redis.clients</groupId>
			<artifactId>jedis</artifactId>
			<version>2.7.3</version>
		</dependency>
		
		<!-- https://mvnrepository.com/artifact/com.oracle/ojdbc14 -->
		<!-- <dependency>
		    <groupId>com.oracle</groupId>
		    <artifactId>ojdbc14</artifactId>
		    <version>10.2.0.1.0</version>
		</dependency> -->

	</dependencies>
</project>

2. Write the WordCount program.

package cn.luxing.mr.wcdemo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordcountDriver {
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		
		//Whether the job runs in local mode depends on whether this parameter
		//is "local", which is also the default
		/*conf.set("mapreduce.framework.name", "local");*/
		
		//In local mode the input/output data can live on the local file system
		//or on HDFS; which one depends on which of the two lines below you use.
		//The default is file:///
		conf.set("fs.defaultFS", "hdfs://192.168.124.140:9000/");
		/*conf.set("fs.defaultFS", "file:///");*/
		
		//Cluster mode means submitting the job to YARN. To run in cluster
		//mode, the following three parameters must point at the cluster
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resourcemanager.hostname", "mini1");
		conf.set("fs.defaultFS", "hdfs://mini1:9000/");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("c:/wc.jar");*/
		//Locate the jar containing this program via the driver class
		job.setJarByClass(WordcountDriver.class);
		
		//Specify the mapper/reducer business classes this job uses
		job.setMapperClass(WordcountMapper.class);
		job.setReducerClass(WordcountReducer.class);
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		//Specify the key/value types of the final output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		//Use a combiner, and specify which class supplies the combine logic
		/*job.setCombinerClass(WordcountCombiner.class);*/
		job.setCombinerClass(WordcountReducer.class);
		
		//If no InputFormat is set, TextInputFormat.class is the default
		job.setInputFormatClass(CombineTextInputFormat.class);
		CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);	// 4 MB
		CombineTextInputFormat.setMinInputSplitSize(job, 2097152);	// 2 MB
		
		//Specify the directory of the job's raw input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		//Submit the job's configuration, along with the jar containing the
		//job's classes, to YARN and wait for completion
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}

}
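The driver references WordcountMapper and WordcountReducer, which the original post does not show. A minimal sketch of the two classes follows (each in its own file; the package name matches the driver, and the split-on-whitespace tokenization is an assumption):

// WordcountMapper.java
package cn.luxing.mr.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	private static final IntWritable ONE = new IntWritable(1);
	private final Text word = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		//Split each input line on whitespace and emit (word, 1) per token
		for (String token : value.toString().split("\\s+")) {
			if (!token.isEmpty()) {
				word.set(token);
				context.write(word, ONE);
			}
		}
	}
}

// WordcountReducer.java
package cn.luxing.mr.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		//Sum all counts emitted for this word. Summing is associative,
		//which is why the driver can also reuse this class as the combiner
		int sum = 0;
		for (IntWritable v : values) {
			sum += v.get();
		}
		context.write(key, new IntWritable(sum));
	}
}

Note that because the driver registers WordcountReducer as the combiner, its input and output key/value types must be identical (Text, IntWritable), which they are here.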

3. Package the program as a jar file and upload it to the Hadoop cluster. In the command in step 4, wc.jar is the jar file you uploaded and cn.luxing.mr.wcdemo.WordcountDriver is the fully qualified name of the driver class.

/test/input is the input path and /test/output is the output path (if it already exists, it must be deleted first); see the sketch below.
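A sketch of the packaging and preparation steps: the jar name produced by Maven follows the artifactId/version in the POM above, while the scp target path and the words.txt sample file are assumptions.

# On the development machine, build the jar in the Maven project root
mvn clean package

# Copy the jar to a cluster node (target host and path are placeholders)
scp target/hadoop2-0.0.1-SNAPSHOT.jar root@192.168.124.140:~/wc.jar

# On the cluster node: create the input directory and upload some text files
hadoop fs -mkdir -p /test/input
hadoop fs -put words.txt /test/input

# The output directory must not exist; remove a previous run's output first
hadoop fs -rm -r /test/output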

4. Run: hadoop jar wc.jar cn.luxing.mr.wcdemo.WordcountDriver /test/input /test/output

5. Check whether the job succeeded. The empty _SUCCESS marker file shows the job completed without errors, and part-r-00000 holds the reducer's output:

# hadoop fs -ls /test
drwxr-xr-x   - root supergroup          0 2019-03-11 00:24 /test/input
drwxr-xr-x   - root supergroup          0 2019-03-11 00:28 /test/output
# hadoop fs -ls /test/output
-rw-r--r--   1 root supergroup          0 2019-03-11 00:28 /test/output/_SUCCESS
-rw-r--r--   1 root supergroup      33184 2019-03-11 00:28 /test/output/part-r-00000
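To inspect the actual word counts, print the result file; each output line is a word and its count, separated by a tab (TextOutputFormat's default):

hadoop fs -cat /test/output/part-r-00000 | head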
