多表关联和单表关联类似,它也是通过对原始数据进行一定的处理,从其中挖掘出关心的信息。如下
输入的是两个文件,一个代表工厂表,包含工厂名列和地址编号列;另一个代表地址表,包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系,输出工厂名-地址名表
样本如下:
factory:
factoryname addressed
beijing red star 1
shenzhen thunder 3
guangzhou honda 2
beijing rising 1
guangzhou development bank 2
tencent 3
bank of beijing 1
address:
addressid addressname
1 beijing
2 guangzhou
3 shenzhen
4 xian
结果:
factoryname addressname
beijing red star beijing
beijing rising beijing
bank of beijing beijing
guangzhou honda guangzhou
guangzhou development bank guangzhou
shenzhen thunder shenzhen
tencent shenzhen
代码如下:
import java.io.ioexception;import java.util.*;import org.apache.hadoop.conf.configuration;import org.apache.hadoop.fs.path;import org.apache.hadoop.io.intwritable;import org.apache.hadoop.io.text;import org.apache.hadoop.mapreduce.job;import org.apache.hadoop.mapreduce.mapper;import org.apache.hadoop.mapreduce.reducer;import org.apache.hadoop.mapreduce.lib.input.fileinputformat;import org.apache.hadoop.mapreduce.lib.output.fileoutputformat;import org.apache.hadoop.util.genericoptionsparser;public class mtjoin { public static int time = 0; /* * 在map中先区分输入行属于左表还是右表,然后对两列值进行分割, * 保存连接列在key值,剩余列和左右表标志在value中,最后输出 */ public static class map extends mapper { // 实现map函数 public void map(object key, text value, context context) throws ioexception, interruptedexception { string line = value.tostring();// 每行文件 string relationtype = new string();// 左右表标识 // 输入文件首行,不处理 if (line.contains(factoryname) == true || line.contains(addressed) == true) { return; } // 输入的一行预处理文本 stringtokenizer itr = new stringtokenizer(line); string mapkey = new string(); string mapvalue = new string(); int i = 0; while (itr.hasmoretokens()) { // 先读取一个单词 string token = itr.nexttoken(); // 判断该地址id就把存到values[0] if (token.charat(0) >= '0' && token.charat(0) 0) { relationtype = 1; } else { relationtype = 2; } continue; } // 存工厂名 mapvalue += token + ; i++; } // 输出左右表 context.write(new text(mapkey), new text(relationtype + ++ mapvalue)); } } /* * reduce解析map输出,将value中数据按照左右表分别保存,* 然后求出笛卡尔积,并输出。 */ public static class reduce extends reducer { // 实现reduce函数 public void reduce(text key, iterable values, context context) throws ioexception, interruptedexception { // 输出表头 if (0 == time) { context.write(new text(factoryname), new text(addressname)); time++; } int factorynum = 0; string[] factory = new string[10]; int addressnum = 0; string[] address = new string[10]; iterator ite = values.iterator(); while (ite.hasnext()) { string record = ite.next().tostring(); int len = record.length(); int i = 2; if (0 == len) { 
continue; } // 取得左右表标识 char relationtype = record.charat(0); // 左表 if ('1' == relationtype) { factory[factorynum] = record.substring(i); factorynum++; } // 右表 if ('2' == relationtype) { address[addressnum] = record.substring(i); addressnum++; } } // 求笛卡尔积 if (0 != factorynum && 0 != addressnum) { for (int m = 0; m javac -classpath hadoop-core-1.1.2.jar:/opt/hadoop-1.1.2/lib/commons-cli-1.2.jar -d firstproject firstproject/mtjoin.java
jar -cvf mtjoin.jar -c firstproject/ .
删除已经存在的output
hadoop fs -rmr output
hadoop fs -mkdir input
hadoop fs -put factory input
hadoop fs -put address input
运行
hadoop jar mtjoin.jar mtjoin input output
查看结果
hadoop fs -cat output/part-r-00000
?
作者:a331251021 发表于2013-8-4 16:20:52 原文链接
阅读:72 评论:0 查看评论
原文地址:hadoop实例---多表关联, 感谢原作者分享。
