hadoop搭建

http://hadoop.apache.org/docs/r1.0.4/cn/cluster_setup.html

 

两台虚拟机

server1    master   172.25.11.1

server2 slave  172.25.11.2

server1 对2做好免密ssh

useradd hadoop  -s /sbin/nologin
passwd hadoop

vim  /etc/sudoers

hadoop  ALL=(ALL) NOPASSWD:ALL

yum install rsync -y

yum install openssh-* -y

scp hadoop-3.0.3.tar.gz server1:/root/

tar zxf hadoop-3.0.3.tar.gz

解压后得到 hadoop-3.0.3 目录

vim /etc/profile

#JAVA_HOME
export JAVA_HOME=/usr/java/jdk1.8.0_171-amd64/
export JRE_HOME=/usr/java/jdk1.8.0_171-amd64/jre
export CLASSPATH=.:$JAVA_HOME/lib:$JRE_HOME/lib:$CLASSPATH
export PATH=$JAVA_HOME/bin:$JRE_HOME/bin:$JAVA_HOME:$PATH
#HADOOP_HOME
export HADOOP_HOME=/root/hadoop-3.0.3
export PATH=${PATH}:${HADOOP_HOME}/bin

 

source /etc/profile

 

[root@server1 hadoop-3.0.3]# hadoop version
Hadoop 3.0.3
Source code repository https://yzhang@git-wip-us.apache.org/repos/asf/hadoop.git -r 37fd7d752db73d984dc31e0cdfd590d252f5e075
Compiled by yzhang on 2018-05-31T17:12Z
Compiled with protoc 2.5.0
From source with checksum 736cdcefa911261ad56d2d120bf1fa
This command was run using /root/hadoop-3.0.3/share/hadoop/common/hadoop-common-3.0.3.jar

[root@server1 hadoop-3.0.3]# mkdir -p /root/hadoop-3.0.3/hdfs/{data,name,tmp}

[root@server1 hadoop]# chmod 777 -R /root/hadoop-3.0.3/hdfs/

[root@server1 hadoop]# pwd
/root/hadoop-3.0.3/etc/hadoop
[root@server1 hadoop]# ls
capacity-scheduler.xml      hadoop-user-functions.sh.example  kms-log4j.properties        ssl-client.xml.example
configuration.xsl           hdfs-site.xml                     kms-site.xml                ssl-server.xml.example
container-executor.cfg      httpfs-env.sh                     log4j.properties            user_ec_policies.xml.template
core-site.xml               httpfs-log4j.properties           mapred-env.cmd              workers
hadoop-env.cmd              httpfs-signature.secret           mapred-env.sh               yarn-env.cmd
hadoop-env.sh               httpfs-site.xml                   mapred-queues.xml.template  yarn-env.sh
hadoop-metrics2.properties  kms-acls.xml                      mapred-site.xml             yarn-site.xml
hadoop-policy.xml           kms-env.sh                        shellprofile.d

现在你可以用以下三种支持的模式中的一种启动Hadoop集群:

  • 单机模式
  • 伪分布式模式
  • 完全分布式模式

 

hadoop3.0需要配置的文件有core-site.xml、hdfs-site.xml、yarn-site.xml、mapred-site.xml、hadoop-env.sh、workers

1.core-site.xml配置文件

<configuration>
  <property>
    <name>fs.defaultFS</name>
     <value>hdfs://server1:8020</value>
  </property>

  <property>
      <name>hadoop.tmp.dir</name>
      <!-- hadoop.tmp.dir 需要本地目录路径,不要写 file:// URI -->
      <value>/root/hadoop-3.0.3/hdfs/tmp</value>
  </property>
</configuration>

 

2.hdfs-site.xml配置文件

 

<configuration>
  <property>
    <name>dfs.replication</name>
    <!-- 副本数不应超过集群中 DataNode 的数量 -->
    <value>3</value>
  </property>
  <property>
    <!-- Hadoop 3 中旧名 dfs.name.dir 已废弃,推荐使用 dfs.namenode.name.dir -->
    <name>dfs.namenode.name.dir</name>
    <value>file:///root/hadoop-3.0.3/hdfs/name</value>
  </property>
  <property>
    <!-- Hadoop 3 中旧名 dfs.data.dir 已废弃,推荐使用 dfs.datanode.data.dir -->
    <name>dfs.datanode.data.dir</name>
    <value>file:///root/hadoop-3.0.3/hdfs/data</value>
  </property>
</configuration>

 

 

[root@server1 hadoop-3.0.3]# hdfs namenode -format

[root@server1 hadoop-3.0.3]# ll hdfs/name/current/
total 16
-rw-r--r-- 1 root root 389 Apr  3 23:57 fsimage_0000000000000000000
-rw-r--r-- 1 root root  62 Apr  3 23:57 fsimage_0000000000000000000.md5
-rw-r--r-- 1 root root   2 Apr  3 23:57 seen_txid
-rw-r--r-- 1 root root 215 Apr  3 23:57 VERSION

[root@server1 hadoop-3.0.3]# sbin/hadoop-daemon.sh start namenode

sbin/hadoop-daemon.sh start datanode

sbin/hadoop-daemon.sh start secondarynamenode

[root@server1 hadoop-3.0.3]# jps
3057 Jps
3026 SecondaryNameNode
2791 NameNode
2909 DataNode

 


 

HDFS上测试创建目录、上传、下载文件

创建目录

 

bin/hdfs dfs -mkdir /demo1

 上传  bin/hdfs dfs -put etc/hadoop/core-site.xml /demo1
  读取  bin/hdfs dfs -cat /demo1/core-site.xml
  下载  bin/hdfs dfs -get /demo1/core-site.xml

http://172.25.11.1:9870/dfshealth.html#tab-overview

hadoop搭建

 

 

 

 

3.workers中设置slave节点,将slave机器的名称写入

server2


4.mapred-site配置

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=/root/hadoop-3.0.3</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=/root/hadoop-3.0.3</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=/root/hadoop-3.0.3</value>
  </property>
</configuration>

 

 

#######################下面的忽略

<property>
 <name>mapreduce.application.classpath</name>
 <value>
/root/hadoop-3.0.3/etc/hadoop,
/root/hadoop-3.0.3/share/hadoop/common/*,
/root/hadoop-3.0.3/share/hadoop/common/lib/*,
/root/hadoop-3.0.3/share/hadoop/common/hdfs/*,
/root/hadoop-3.0.3/share/hadoop/common/hdfs/lib/*,
/root/hadoop-3.0.3/share/hadoop/common/mapreduce/*,
/root/hadoop-3.0.3/share/hadoop/common/mapreduce/lib/*,
/root/hadoop-3.0.3/share/hadoop/common/yarn/*,
/root/hadoop-3.0.3/share/hadoop/common/yarn/lib/*
 </value>
</property>
</configuration>

#########################################

 

 

 

5.yarn-site.xml配置

<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>server1</value>
  </property>
</configuration>

6  vim  /root/hadoop-3.0.3/etc/hadoop/hadoop-env.sh

export JAVA_HOME=/usr/java/jdk1.8.0_171-amd64/

 

以上配置完成后,将hadoop整个文件夹复制到server2

scp -rp

#######################忽略下面配置

<configuration>
  <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>  
    </property>  
    <property>  
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>  
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>  
    </property>  
    <property>  
        <name>yarn.resourcemanager.resource-tracker.address</name>  
        <value>server1:8025</value>
    </property>  
    <property>  
        <name>yarn.resourcemanager.scheduler.address</name>  
        <value>server1:8030</value>
    </property>  
    <property>  
        <name>yarn.resourcemanager.address</name>
        <value>server1:8040</value>
    </property>
</configuration>

##########################################

 

启动Resourcemanager

sbin/yarn-daemon.sh start resourcemanager

启动nodemanager

sbin/yarn-daemon.sh start nodemanager

也可执行批处理文件启动服务
启动hdfs 和yarn
sbin/start-dfs.sh
sbin/start-yarn.sh

sbin/start-all.sh

http://172.25.11.1:8088/cluster

hadoop搭建

 

 

运行MapReduce Job

创建测试用的Input文件
bin/hdfs dfs -mkdir -p /wordcountdemo/input

[root@server1 hadoop-3.0.3]# vim hdfs/data/wc.input

hadoop mapreduce hive
hbase spark storm
sqoop hadoop hive
spark hadoop

[root@server1 hadoop-3.0.3]# bin/hdfs dfs -put hdfs/data/wc.input /wordcountdemo/input



 

 

[root@server1 ~]# hdfs namenode -format

[root@server1 ~]# /root/hadoop-3.0.3/sbin/start-dfs.sh

[root@server1 ~]# /root/hadoop-3.0.3/sbin/start-yarn.sh

[root@server1 hadoop-3.0.3]# bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.0.3.jar wordcount /wordcountdemo/input /wordcountdemo/output
2019-04-04 00:13:32,357 INFO client.RMProxy: Connecting to ResourceManager at server1/172.25.11.1:8032
2019-04-04 00:13:34,005 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1554307791869_0001
2019-04-04 00:13:34,443 INFO input.FileInputFormat: Total input files to process : 1
2019-04-04 00:13:35,584 INFO mapreduce.JobSubmitter: number of splits:1
2019-04-04 00:13:35,665 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-04-04 00:13:36,470 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1554307791869_0001
2019-04-04 00:13:36,471 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-04-04 00:13:36,760 INFO conf.Configuration: resource-types.xml not found
2019-04-04 00:13:36,761 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-04-04 00:13:37,227 INFO impl.YarnClientImpl: Submitted application application_1554307791869_0001
2019-04-04 00:13:37,317 INFO mapreduce.Job: The url to track the job: http://server1:8088/proxy/application_1554307791869_0001/

[2019-04-04 00:14:23.058]Container killed on request. Exit code is 143
[2019-04-04 00:14:23.092]Container exited with a non-zero exit code 143.

2019-04-04 00:14:32,657 INFO mapreduce.Job:  map 100% reduce 0%
2019-04-04 00:14:41,785 INFO mapreduce.Job:  map 100% reduce 100%
2019-04-04 00:14:43,805 INFO mapreduce.Job: Job job_1554307791869_0001 completed successfully
2019-04-04 00:14:44,139 INFO mapreduce.Job: Counters: 55
    File System Counters
        FILE: Number of bytes read=94
        FILE: Number of bytes written=403929
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=184
        HDFS: Number of bytes written=60
        HDFS: Number of read operations=8
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Failed map tasks=2
        Launched map tasks=3
        Launched reduce tasks=1
        Other local map tasks=2
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=25760
        Total time spent by all reduces in occupied slots (ms)=5821
        Total time spent by all map tasks (ms)=25760
        Total time spent by all reduce tasks (ms)=5821
        Total vcore-milliseconds taken by all map tasks=25760
        Total vcore-milliseconds taken by all reduce tasks=5821
        Total megabyte-milliseconds taken by all map tasks=26378240
        Total megabyte-milliseconds taken by all reduce tasks=5960704
    Map-Reduce Framework
        Map input records=4
        Map output records=11
        Map output bytes=115
        Map output materialized bytes=94
        Input split bytes=113
        Combine input records=11
        Combine output records=7
        Reduce input groups=7
        Reduce shuffle bytes=94
        Reduce input records=7
        Reduce output records=7
        Spilled Records=14
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=174
        CPU time spent (ms)=1280
        Physical memory (bytes) snapshot=317149184
        Virtual memory (bytes) snapshot=5472792576
        Total committed heap usage (bytes)=140574720
        Peak Map Physical memory (bytes)=202874880
        Peak Map Virtual memory (bytes)=2732937216
        Peak Reduce Physical memory (bytes)=114274304
        Peak Reduce Virtual memory (bytes)=2739855360
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=71
    File Output Format Counters
        Bytes Written=60

 

统计结果

[root@server1 hadoop-3.0.3]# bin/hdfs dfs -cat /wordcountdemo/output/part-r-00000
hadoop    3
hbase    1
hive    2
mapreduce    1
spark    2
sqoop    1
storm    1

停止Hadoop

sbin/hadoop-daemon.sh stop namenode
sbin/hadoop-daemon.sh stop datanode
sbin/yarn-daemon.sh stop resourcemanager
sbin/yarn-daemon.sh stop nodemanager

全部停止批处理文件
sbin/stop-yarn.sh
sbin/stop-dfs.sh

sbin/stop-all.sh

HDFS模块简介

HDFS负责大数据的存储,通过将大文件分块后进行分布式存储方式,突破了服务器硬盘大小的限制,解决了单台机器无法存储大文件的问题,HDFS是个相对独立的模块,可以为YARN提供服务,也可以为HBase等其他模块提供服务。

YARN模块简介

YARN是一个通用的资源协同和任务调度框架,是为了解决Hadoop1.x中MapReduce里NameNode负载太大和其他问题而创建的一个框架。

YARN是个通用框架,不止可以运行MapReduce,还可以运行Spark、Storm等其他计算框架。

MapReduce模块简介

MapReduce是一个计算框架,它给出了一种数据处理的方式,即通过Map阶段、Reduce阶段来分布式地流式处理数据。它只适用于大数据的离线处理,对实时性要求很高的应用不适用。

---- the end ----