Apriori on MapReduce

Apriori算法在Hadoop MapReduce上的实现

输入格式：

一行为一个Bucket（即一条事务），行内各item为以空白字符分隔的整数编号

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 16 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 51 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 51 52 54 56 58 60 63 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 36 38 40 42 44 47 48 51 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 15 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 65 66 68 70 72 74

1 3 5 7 9 11 13 16 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 16 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 65 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 43 44 47 48 50 52 54 56 58 60 62 65 66 68 70 72 74

输出格式：

<item1,item2,...itemK, frequency>

代码：

 package apriori;

 import java.io.*;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.StringTokenizer;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Mapper.Context;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
 import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 class AprioriPass1Mapper extends Mapper<Object,Text,Text,IntWritable>{

     private final static IntWritable one = new IntWritable(1);

     private Text number = new Text();

     //第一次pass的Mapper只要把每个item映射为1

     public void map(Object key,Text value,Context context) throws IOException,InterruptedException{

         String[] ids = value.toString().split("[\\s\\t]+");

         for(int i = 0;i < ids.length;i++){

             context.write(new Text(ids[i]),one);

         }

     }

 }

 /**
  * Reducer shared by the jobs of every pass: totals the counts emitted for one
  * itemset and keeps the itemset only when the total exceeds the threshold.
  */
 class AprioriReducer extends Reducer<Text,IntWritable,Text,IntWritable>{

     private IntWritable result = new IntWritable();

     /**
      * Adds up all values for {@code key} and writes (key, total) when the
      * total is strictly greater than the "minSup" configuration value
      * (5 when the setting is absent).
      */
     public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException,InterruptedException{
         int threshold = context.getConfiguration().getInt("minSup",5);

         int total = 0;
         Iterator<IntWritable> counts = values.iterator();
         while(counts.hasNext()){
             total += counts.next().get();
         }

         // Itemsets at or below the threshold are dropped.
         if(total <= threshold){
             return;
         }
         result.set(total);
         context.write(key,result);
     }
 }

 /**
  * Mapper for every pass after the first. Before any record is mapped,
  * {@link #setup} reads the itemsets written by the previous pass and builds
  * the candidate itemsets for this pass; {@link #map} then emits (candidate, 1)
  * for each candidate contained in the input line.
  */
 class AprioriPassKMapper extends Mapper<Object,Text,Text,IntWritable>{

     private final static IntWritable one = new IntWritable(1);

     // Reused for every emitted key instead of allocating a new Text per match.
     private Text item = new Text();

     // Candidate itemsets for this pass, each sorted ascending.
     private List< List<Integer> > candidateItemsets = new ArrayList< List<Integer> >();

     // candidateKeys.get(i) is the comma-joined text of candidateItemsets.get(i),
     // built once in setup so map() does not rebuild it for every match.
     private List<String> candidateKeys = new ArrayList<String>();

     /**
      * Loads the previous pass's output and derives this pass's candidates.
      *
      * The file read is "part-r-00000" under the directory
      * hdfsOutputDirPrefix + (passNum - 1), resolved the same way the driver
      * resolves its output directory. Only that single part file is consulted;
      * NOTE(review): a job run with more than one reducer would presumably
      * write further part files that are not read here.
      *
      * @throws IOException if the previous output cannot be read or contains a
      *                     key that is not a comma-separated list of integers
      */
     @Override
     public void setup(Context context) throws IOException, InterruptedException{
         Configuration conf = context.getConfiguration();
         int passNum = conf.getInt("passNum",2);
         String prefix = conf.get("hdfsOutputDirPrefix","");

         Path lastPass = new Path(prefix + (passNum - 1),"part-r-00000");
         List< List<Integer> > prevItemsets = readItemsets(lastPass,conf);

         //get candidate itemsets from the prev itemsets
         candidateItemsets = getCandidateItemsets(prevItemsets,passNum - 1);

         candidateKeys = new ArrayList<String>(candidateItemsets.size());
         for(List<Integer> candidate : candidateItemsets){
             candidateKeys.add(joinItems(candidate));
         }
     }

     /**
      * Emits (candidate, 1) for every candidate itemset that is a subset of
      * the items on this line, so the reducer can count candidate support.
      *
      * @param key     input key supplied by the framework; not used
      * @param value   one input line holding whitespace-separated integer ids
      * @param context sink for the (candidate, 1) pairs
      */
     @Override
     public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
         List<Integer> itemset = new ArrayList<Integer>();
         for(String id : value.toString().split("[\\s\\t]+")){
             // Blank lines and leading whitespace produce an empty token.
             if(id.length() == 0){
                 continue;
             }
             itemset.add(Integer.parseInt(id));
         }
         // contains() walks both lists in ascending order, so make sure the
         // transaction really is sorted rather than trusting the input.
         Collections.sort(itemset);

         // The subset test against every candidate dominates the running time.
         for(int c = 0;c < candidateItemsets.size();c++){
             if(contains(candidateItemsets.get(c),itemset)){
                 item.set(candidateKeys.get(c));
                 context.write(item,one);
             }
         }
     }

     /**
      * Reads one itemset per line from {@code path}. The itemset is the first
      * whitespace-delimited field of the line, with items separated by commas.
      * Lines whose first field is empty are skipped.
      */
     private List< List<Integer> > readItemsets(Path path,Configuration conf) throws IOException{
         List< List<Integer> > itemsets = new ArrayList< List<Integer> >();
         FileSystem fs = path.getFileSystem(conf);
         BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path),"UTF-8"));
         try{
             String line = null;
             while((line = reader.readLine()) != null){
                 String itemsStr = line.split("[\\s\\t]+")[0];
                 if(itemsStr.length() == 0){
                     continue;
                 }
                 List<Integer> itemset = new ArrayList<Integer>();
                 try{
                     for(String itemStr : itemsStr.split(",")){
                         itemset.add(Integer.parseInt(itemStr));
                     }
                 }catch(NumberFormatException e){
                     throw new IOException("Malformed itemset '" + itemsStr + "' in " + path,e);
                 }
                 itemsets.add(itemset);
             }
         }finally{
             reader.close();
         }
         return itemsets;
     }

     /** Joins the items with commas, e.g. [1, 3, 5] becomes "1,3,5". */
     private static String joinItems(List<Integer> items){
         StringBuilder joined = new StringBuilder();
         for(int i = 0;i < items.size();i++){
             if(i > 0){
                 joined.append(',');
             }
             joined.append(items.get(i));
         }
         return joined.toString();
     }

     /**
      * Returns whether {@code items} is a subset of {@code allItems}.
      * Both lists must be sorted ascending.
      */
     private boolean contains(List<Integer> items,List<Integer> allItems){
         int i = 0;
         for(int j = 0;i < items.size() && j < allItems.size();j++){
             // Compare primitive values: == on boxed Integers compares object
             // references and is only reliable for small cached values.
             int wanted = items.get(i).intValue();
             int present = allItems.get(j).intValue();
             if(present > wanted){
                 // allItems has already moved past the wanted item.
                 return false;
             }
             if(present == wanted){
                 i++;
             }
         }
         return i == items.size();
     }

     /**
      * Builds the candidate itemsets of size passNum + 1 from the itemsets of
      * size passNum produced by the previous pass, following Apriori: two
      * previous itemsets are joined when they differ in exactly one item, and
      * the result is kept only if each of its subsets obtained by removing one
      * item appears in the previous pass's output.
      *
      * @param prevItemsets itemsets output by the previous pass
      * @param passNum      number of the previous pass, used as the size of
      *                     each previous itemset
      * @return the distinct candidates, each sorted ascending
      */
     private List< List<Integer> > getCandidateItemsets(List< List<Integer> > prevItemsets, int passNum){
         List< List<Integer> > candidates = new ArrayList<List<Integer> >();

         // Hash lookups replace repeated linear scans of the lists.
         Set< List<Integer> > prevLookup = new HashSet< List<Integer> >(prevItemsets);
         Set< List<Integer> > seen = new HashSet< List<Integer> >();

         for(int i = 0;i < prevItemsets.size();i++){
             List<Integer> outerItems = prevItemsets.get(i);
             for(int j = i + 1;j < prevItemsets.size();j++){
                 List<Integer> innerItems = prevItemsets.get(j);
                 List<Integer> newItems = null;

                 if(passNum == 1){
                     // Single items: every pair forms a candidate.
                     newItems = new ArrayList<Integer>();
                     newItems.add(outerItems.get(0));
                     newItems.add(innerItems.get(0));
                 }
                 else{
                     // Count the outer items missing from inner, stopping as
                     // soon as a second one is found.
                     int nDifferent = 0;
                     int index = -1;
                     for(int k = 0; k < passNum && nDifferent < 2;k++){
                         if(!innerItems.contains(outerItems.get(k))){
                             nDifferent++;
                             index = k;
                         }
                     }
                     if(nDifferent == 1){
                         newItems = new ArrayList<Integer>();
                         newItems.addAll(innerItems);
                         newItems.add(outerItems.get(index));
                     }
                 }

                 if(newItems == null){
                     continue;
                 }
                 Collections.sort(newItems);

                 // seen.add() returns false for an itemset already recorded.
                 if(isCandidate(newItems,prevLookup) && seen.add(newItems)){
                     candidates.add(newItems);
                 }
             }
         }
         return candidates;
     }

     /**
      * Returns whether every subset of {@code newItems} formed by removing one
      * item is present in {@code prevItemsets}.
      */
     private boolean isCandidate(List<Integer> newItems,Set< List<Integer> > prevItemsets){
         for(List<Integer> subset : getSubsets(newItems)){
             if(!prevItemsets.contains(subset)){
                 return false;
             }
         }
         return true;
     }

     /** Returns the subsets of {@code items} obtained by removing exactly one element. */
     private List<List<Integer>> getSubsets(List<Integer> items){
         List<List<Integer>> subsets = new ArrayList<List<Integer>>();
         for(int i = 0;i < items.size();i++){
             List<Integer> subset = new ArrayList<Integer>(items);
             // remove(int) removes by position, not by value.
             subset.remove(i);
             subsets.add(subset);
         }
         return subsets;
     }
 }

 public class Apriori extends Configured implements Tool{

     public static int s;

     public static int k;

     public int run(String[] args)throws IOException,InterruptedException,ClassNotFoundException{

         long startTime = System.currentTimeMillis();

         String hdfsInputDir = args[0];        //从参数1中读取输入数据

         String hdfsOutputDirPrefix = args[1];    //参数2为输出数据前缀，和第pass次组成输出目录

         s = Integer.parseInt(args[2]);        //阈值

         k = Integer.parseInt(args[3]);        //k次pass

         //循环执行K次pass

         for(int pass = 1; pass <= k;pass++){

             long passStartTime = System.currentTimeMillis();

             //配置执行该job

             if(!runPassKMRJob(hdfsInputDir,hdfsOutputDirPrefix,pass)){

                 return -1;

             }

             long passEndTime = System.currentTimeMillis();

             System.out.println("pass " + pass + " time : " + (passEndTime - passStartTime));

         }

         long endTime = System.currentTimeMillis();

         System.out.println("total time : " + (endTime - startTime));

         return 0;

     }

     private static boolean runPassKMRJob(String hdfsInputDir,String hdfsOutputDirPrefix,int passNum)

             throws IOException,InterruptedException,ClassNotFoundException{

             Configuration passNumMRConf = new Configuration();

             passNumMRConf.setInt("passNum",passNum);

             passNumMRConf.set("hdfsOutputDirPrefix",hdfsOutputDirPrefix);

             passNumMRConf.setInt("minSup",s);

             Job passNumMRJob = new Job(passNumMRConf,"" + passNum);

             passNumMRJob.setJarByClass(Apriori.class);

             if(passNum == 1){

                 //第一次pass的Mapper类特殊对待，不许要构造候选itemsets

                 passNumMRJob.setMapperClass(AprioriPass1Mapper.class);

             }

             else{

                 //第一次之后的pass的Mapper类特殊对待，不许要构造候选itemsets

                 passNumMRJob.setMapperClass(AprioriPassKMapper.class);

             }

             passNumMRJob.setReducerClass(AprioriReducer.class);

             passNumMRJob.setOutputKeyClass(Text.class);

             passNumMRJob.setOutputValueClass(IntWritable.class);

             FileInputFormat.addInputPath(passNumMRJob,new Path(hdfsInputDir));

             FileOutputFormat.setOutputPath(passNumMRJob,new Path(hdfsOutputDirPrefix + passNum));

             return passNumMRJob.waitForCompletion(true);

     }

     public static void main(String[] args) throws Exception{

         int exitCode = ToolRunner.run(new Apriori(),args);

         System.exit(exitCode);

     }

 }

Apriori on MapReduce的更多相关文章

记录近期小改Apriori至MapReduce上的心得
·背景前一阵,一直在研究一些ML的东东,后来工作关系暂停了一阵.现在继续把剩下一些热门的算法再吃吃透,"无聊+逗比"地把他们搞到MapReduce上.这次选择的入手对象为Apri ...
常见数据挖掘算法的Map-Reduce策略(2)
接着上一篇文章常见算法的mapreduce案例(1)继续挖坑,本文涉及到算法的基本原理,文中会大概讲讲,但具体有关公式的推导还请大家去查阅相关的文献文章.下面涉及到的数据挖掘算法会有:L ...
MapReduce实现Apriori算法
Apriori算法在Hadoop MapReduce上的实现输入格式: 一行为一个Bucket 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 ...
#研发解决方案#基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
使用Apriori算法和FP-growth算法进行关联分析
系列文章:<机器学习实战>学习笔记最近看了<机器学习实战>中的第11章(使用Apriori算法进行关联分析)和第12章(使用FP-growth算法来高效发现频繁项集).正如章 ...
利用Apriori算法对交通路况的研究
首先简单描述一下Apriori算法:Apriori算法分为频繁项集的产生和规则的产生. Apriori算法频繁项集的产生: 令ck为候选k-项集的集合,而Fk为频繁k-项集的集合. 1.首先通过单遍扫 ...
基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案郑昀基于杨海波的设计文档（转）
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
基于Hadoop的改进Apriori算法
一.Apriori算法性质性质一: 候选的k元组集合Ck中,任意k-1个项组成的集合都来自于Lk. 性质二: 若k维数据项目集X={i1,i2,-,ik}中至少存在一个j∈X,使得|L(k-1)(j ...
海量数据挖掘MMDS week2: 频繁项集挖掘 Apriori算法的改进：非hash方法
http://blog.csdn.net/pipisorry/article/details/48914067 海量数据挖掘Mining Massive Datasets(MMDs) -Jure Le ...

随机推荐

俄罗斯画师Mikhail Rakhmatullin作品
quick3.5 removeFromParent()导致的windows下模拟器崩溃问题
今天遇到一个问题,点击一个按钮,这个按钮所在的layer从scene移除: local click = function ( event ) local StartScene=require(&quo ...
Top命令－转
Windows下的任务管理器虽然不好用(个人更喜欢Process Explorer些),但也算方便,可以方便的查看进程,CPU,内存...也可以很容易的结束进程没有图形化界面下的Linux,也有命令 ...
输入事件驱动---evdev_handler的大致实现流程（修整版）
一.input输入子系统框架下图是input输入子系统框架,输入子系统由输入子系统核心层(input core),驱动层和事件处理层(Event Handler)三部分组成.一个输入事件,比如滑动 ...
JQuery_事件基础
JavaScript 有一个非常重要的功能,就是事件驱动.当页面完全加载后,用户通过鼠标或键盘触发页面中绑定事件的元素即可触发. jQuery 为开发者更有效率的编写事件行为, 封装了大量有益的事件方 ...
[WPF]UserControl的MouseWheel事件触发
用户控件: <UserControl> <Grid> <TextBox x:Name="textBlock" HorizontalAlignment= ...
7 -- Spring的基本用法 -- 5...
7.5 Spring容器中的Bean 7.5.1 Bean的基本定义和Bean别名 <beans.../>元素是Spring配置文件的根元素,该元素可以指定如下属性: default-la ...
AxureRP8实战手册（基础31-40）
AxureRP8实战手册(基础31-40) 本文目录基础31. 切换元件库第2章页面设置基础32. 设置页面居中基础33. 设置页面背景(图片/颜色 ...
20145218 GDB调试汇编堆栈过程分析
GDB调试汇编堆栈过程分析虚拟机中分析过程输入gcc - g example.c -o example -m32指令在64位机器上产生32位汇编,但出现以下错误: 这时需要使用sudo apt-g ...
Java_ToolKit用法
转自:http://blog.sina.com.cn/s/blog_9e4556250100z5kv.html 此类是所有 Abstract Window Toolkit 实际实现的抽象超类.Tool ...

Apriori on MapReduce

输入格式：

输出格式：

代码：

Apriori on MapReduce的更多相关文章

随机推荐

热门专题