Hadoop之MapReduce

2020-09-25

MapReduce简介

MapReduce（分布式计算框架）是一种基于磁盘的分布式并行批处理计算模型，用于处理大数据量的计算。其中Map对数据集上的独立元素进行指定的操作，生成键-值对形式的中间结果；Reduce则对中间结果中相同键的所有值进行规约，以得到最终结果。

Jobtracker：master节点，只有一个，负责管理所有作业、监控任务/作业、处理错误等，将作业分解成一系列任务，并分派给Tasktracker。

Tasktracker：slave节点，运行Map task和Reduce task；并与Jobtracker交互，汇报任务状态。

Map task：解析每条数据记录，传递给用户编写的map()函数并执行，将输出结果写入到本地磁盘（如果为map-only作业，则直接写入HDFS）。

Reduce task：从Map task的执行结果中远程读取输入数据，对数据进行排序，将数据分组传递给用户编写的reduce()函数执行。

例子

<!--
  Hadoop client libraries for the example below (goes inside the <dependencies> section of pom.xml).
  NOTE(review): the example code imports classes from org.apache.hadoop.mapreduce; these normally
  ship in the hadoop-mapreduce-client-* artifacts, which are not listed here - confirm they are on
  the classpath.
  NOTE(review): the versions here are 2.8.0 while the test driver points hadoop.home.dir at a
  hadoop-3.0.0 installation - confirm the client and installation versions are meant to differ.
-->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.8.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.8.0</version>
</dependency>

package com.liujl.mapreduce.util;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * Mapper of the "maximum temperature per year" example.
 *
 * <p>Every input line is expected to start with a four-character year, with the temperature
 * (a decimal integer) occupying everything from character index 8 to the end of the line,
 * e.g. {@code 2014010114} yields year {@code 2014} and temperature {@code 14}.
 *
 * <p>The four generic type arguments are:
 * <ul>
 *   <li>KeyIn ({@link LongWritable}) - input key, the starting position of the line in the file</li>
 *   <li>ValueIn ({@link Text}) - input value, the text of the line</li>
 *   <li>KeyOut ({@link Text}) - output key, the year taken from the line</li>
 *   <li>ValueOut ({@link IntWritable}) - output value, the temperature taken from the line</li>
 * </ul>
 *
 * @author Administrator (created 2020/9/23 16:18)
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Number of leading characters that hold the year. */
    private static final int YEAR_LENGTH = 4;

    /** Index of the first character of the temperature field. */
    private static final int TEMPERATURE_OFFSET = 8;

    /** Counter group / name used to report records that could not be parsed. */
    private static final String COUNTER_GROUP = "MyMapper";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    // Reused for every record: context.write() serializes the pair immediately,
    // so there is no need to allocate fresh Writable objects per line.
    private final Text yearKey = new Text();
    private final IntWritable temperatureValue = new IntWritable();

    /**
     * Emits one {@code (year, temperature)} pair for each well-formed input line.
     *
     * <p>Blank lines (for example a trailing newline at the end of the input file) are skipped.
     * Lines that are too short or whose temperature field is not an integer are skipped as well
     * and counted in the {@code MyMapper/MALFORMED_RECORDS} job counter instead of failing the task.
     *
     * @param key     starting position of the line in the input (unused)
     * @param value   the text of the line
     * @param context task context used to emit the output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;
        }
        // A line of length <= 8 has no temperature field at all.
        if (line.length() <= TEMPERATURE_OFFSET) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        int temperature;
        try {
            temperature = Integer.parseInt(line.substring(TEMPERATURE_OFFSET));
        } catch (NumberFormatException ignored) {
            // Deliberately not rethrown: one bad record should not fail the whole job;
            // it is made visible through the counter instead.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        yearKey.set(line.substring(0, YEAR_LENGTH));
        temperatureValue.set(temperature);
        context.write(yearKey, temperatureValue);
        // Sample output: ======After Mapper:2000, 15
        System.out.println("======" + "After Mapper:" + yearKey + ", " + temperatureValue);
    }
}

package com.liujl.mapreduce.util;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

/**
 * Reducer of the "maximum temperature per year" example.
 *
 * <p>Receives every temperature emitted for one year and writes a single
 * {@code (year, maximum temperature)} pair.
 *
 * @author Administrator (created 2020/9/23 16:30)
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Computes the maximum of {@code values} and emits it under {@code key}.
     *
     * <p>For demonstration purposes the incoming values and the result are also printed to
     * standard output; the "Before Reduce" part is printed without a line break so that both
     * parts end up on the same line.
     *
     * @param key     the year
     * @param values  all temperatures recorded for that year
     * @param context task context used to emit the output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context) throws IOException, InterruptedException {
        int maxValue = Integer.MIN_VALUE;
        // Local, single-threaded buffer: StringBuilder avoids StringBuffer's needless locking.
        StringBuilder seenValues = new StringBuilder();
        // Find the maximum while recording every value for the debug output.
        for (IntWritable value : values) {
            // Read the primitive once per element.
            int temperature = value.get();
            maxValue = Math.max(maxValue, temperature);
            seenValues.append(temperature).append(", ");
        }
        // Sample output: Before Reduce: 2000, 15, 23, 99, 12, 22,
        System.out.print("Before Reduce: " + key + ", " + seenValues);
        context.write(key, new IntWritable(maxValue));
        // Sample output: ======After Reduce: 2000, 99
        System.out.println("======" + "After Reduce: " + key + ", " + maxValue);
    }
}

package com.liujl.mapreduce;

import com.liujl.mapreduce.util.MyMapper;
import com.liujl.mapreduce.util.MyReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.IOException;

/**
 * Driver for the "maximum temperature per year" example, run as a test.
 *
 * <p>Configures a MapReduce job that reads {@code mapreduce.txt} from HDFS, processes it with
 * {@link MyMapper} and {@link MyReducer}, and writes the result to a fresh HDFS directory.
 */
@SpringBootTest
class MapreduceApplicationTests {

    /** HDFS directory that holds the input file and receives the output directories. */
    private static final String HDFS_BASE_DIR = "hdfs://localhost:9500/user/wcinput";

    /**
     * Runs the job and waits for it to finish.
     *
     * @throws IOException            if the job cannot be configured or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws IllegalStateException  if the job finishes unsuccessfully
     */
    @Test
    void contextLoads() throws IOException, ClassNotFoundException, InterruptedException {
        // Set before touching any Hadoop class: Hadoop's shell utilities look this property up
        // when they are first loaded (needed to locate the native helpers on Windows).
        System.setProperty("hadoop.home.dir", "E:\\hadoop-3.0.0");

        Configuration hadoopConfig = new Configuration();
        hadoopConfig.set("fs.hdfs.impl",
                org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
        );
        hadoopConfig.set("fs.file.impl",
                org.apache.hadoop.fs.LocalFileSystem.class.getName()
        );

        // Input path.
        String dst = HDFS_BASE_DIR + "/mapreduce.txt";
        // Output path: it must not exist yet (even an empty directory is rejected),
        // so a timestamp-based name is generated for every run.
        String dstOut = HDFS_BASE_DIR + "/" + System.currentTimeMillis();

        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(hadoopConfig);
        // When the job is packaged as a jar and submitted to a cluster,
        // job.setJarByClass(...) must additionally be called with a class from that jar.

        // Input and output locations used by the job.
        FileInputFormat.addInputPath(job, new Path(dst));
        FileOutputFormat.setOutputPath(job, new Path(dstOut));
        // Custom Mapper and Reducer that implement the two phases.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Key and value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Run the job and block until it completes; a failed job must fail the test
        // instead of being reported as "Finished".
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("MapReduce job failed, output path: " + dstOut);
        }
        System.out.println("Finished");
    }


}

2014010114
2014010216
2014010317
2014010410
2014010506
2012010609
2012010732
2012010812
2012010919
2012011023
2001010116
2001010212
2001010310
2001010411
2001010529
2013010619
2013010722
2013010812
2013010929
2013011023
2008010105
2008010216
2008010337
2008010414
2008010516
2007010619
2007010712
2007010812
2007010999
2007011023
2010010114
2010010216
2010010317
2010010410
2010010506
2015010649
2015010722
2015010812
2015010999
2015011023