Apriori算法实现

Apriori算法原理：http://blog.csdn.net/kingzone_2008/article/details/8183768

import java.util.HashMap;

import java.util.HashSet;

import java.util.Iterator;

import java.util.Map;

import java.util.Set;

import java.util.TreeMap;

/**

* <B>关联规则挖掘：Apriori算法</B>

*

* <P>按照Apriori算法的基本思想来实现

*

* @author king

* @since 2013/06/27

*

*/

public class Apriori {

	private Map<Integer, Set<String>> txDatabase; // 事务数据库

	private Float minSup; // 最小支持度

	private Float minConf; // 最小置信度

	private Integer txDatabaseCount; // 事务数据库中的事务数

	private Map<Integer, Set<Set<String>>> freqItemSet; // 频繁项集集合

	private Map<Set<String>, Set<Set<String>>> assiciationRules; // 频繁关联规则集合

	public Apriori(

	    Map<Integer, Set<String>> txDatabase,

	    Float minSup,

	    Float minConf) {

	   this.txDatabase = txDatabase;

	   this.minSup = minSup;

	   this.minConf = minConf;

	   this.txDatabaseCount = this.txDatabase.size();

	   freqItemSet = new TreeMap<Integer, Set<Set<String>>>();

	   assiciationRules = new HashMap<Set<String>, Set<Set<String>>>();

	}

	/**

	* 扫描事务数据库，计算频繁1-项集

	* @return

	*/

	public Map<Set<String>, Float> getFreq1ItemSet() {

	   Map<Set<String>, Float> freq1ItemSetMap = new HashMap<Set<String>, Float>();

	   Map<Set<String>, Integer> candFreq1ItemSet = this.getCandFreq1ItemSet();

	   Iterator<Map.Entry<Set<String>, Integer>> it = candFreq1ItemSet.entrySet().iterator();

	   while(it.hasNext()) {

	    Map.Entry<Set<String>, Integer> entry = it.next();

	    // 计算支持度

	    Float supported = new Float(entry.getValue().toString())/new Float(txDatabaseCount);

	    if(supported>=minSup) {

	     freq1ItemSetMap.put(entry.getKey(), supported);

	    }

	   }

	   return freq1ItemSetMap;

	}

	/**

	* 计算候选频繁1-项集

	* @return

	*/

	public Map<Set<String>, Integer> getCandFreq1ItemSet() {

	   Map<Set<String>, Integer> candFreq1ItemSetMap = new HashMap<Set<String>, Integer>();

	   Iterator<Map.Entry<Integer, Set<String>>> it = txDatabase.entrySet().iterator();

	   // 统计支持数，生成候选频繁1-项集

	   while(it.hasNext()) {

	    Map.Entry<Integer, Set<String>> entry = it.next();

	    Set<String> itemSet = entry.getValue();

	    for(String item : itemSet) {

	     Set<String> key = new HashSet<String>();

	     key.add(item.trim());

	     if(!candFreq1ItemSetMap.containsKey(key)) {

	      Integer value = 1;

	      candFreq1ItemSetMap.put(key, value);

	     }

	     else {

	      Integer value = 1+candFreq1ItemSetMap.get(key);

	      candFreq1ItemSetMap.put(key, value);

	     }

	    }

	   }

	   return candFreq1ItemSetMap;

	}

	/**

	* 根据频繁(k-1)-项集计算候选频繁k-项集

	*

	* @param m 其中m=k-1

	* @param freqMItemSet 频繁(k-1)-项集

	* @return

	*/

	public Set<Set<String>> aprioriGen(int m, Set<Set<String>> freqMItemSet) {

	   Set<Set<String>> candFreqKItemSet = new HashSet<Set<String>>();

	   Iterator<Set<String>> it = freqMItemSet.iterator();

	   Set<String> originalItemSet = null;

	   while(it.hasNext()) {

	    originalItemSet = it.next();

	    Iterator<Set<String>> itr = this.getIterator(originalItemSet, freqMItemSet);

	    while(itr.hasNext()) {

	     Set<String> identicalSet = new HashSet<String>(); // 两个项集相同元素的集合(集合的交运算)

	     identicalSet.addAll(originalItemSet);

	     Set<String> set = itr.next();

	     identicalSet.retainAll(set); // identicalSet中剩下的元素是identicalSet与set集合中公有的元素

	     if(identicalSet.size() == m-1) { // (k-1)-项集中k-2个相同

	      Set<String> differentSet = new HashSet<String>(); // 两个项集不同元素的集合(集合的差运算)

	      differentSet.addAll(originalItemSet);

	      differentSet.removeAll(set); // 因为有k-2个相同，则differentSet中一定剩下一个元素，即differentSet大小为1

	      differentSet.addAll(set); // 构造候选k-项集的一个元素(set大小为k-1,differentSet大小为k)

	      if(!this.has_infrequent_subset(differentSet, freqMItemSet))

	          candFreqKItemSet.add(differentSet); // 加入候选k-项集集合

	     }

	    }

	   }

	   return candFreqKItemSet;

	}

	/**

	 * 使用先验知识，剪枝。若候选k项集中存在k-1项子集不是频繁k-1项集，则删除该候选k项集

	 * @param candKItemSet

	 * @param freqMItemSet

	 * @return

	 */

	private boolean has_infrequent_subset(Set<String> candKItemSet, Set<Set<String>> freqMItemSet) {

		Set<String> tempSet = new HashSet<String>();

		tempSet.addAll(candKItemSet);

		Iterator<String> itItem = candKItemSet.iterator();

		while(itItem.hasNext()) {

			String item = itItem.next();

			tempSet.remove(item);// 该候选去掉一项后变为k-1项集

			if(!freqMItemSet.contains(tempSet))// 判断k-1项集是否是频繁项集

				return true;

			tempSet.add(item);// 恢复

		}

		return false;

	}

	/**

	* 根据一个频繁k-项集的元素(集合)，获取到频繁k-项集的从该元素开始的迭代器实例

	* @param itemSet

	* @param freqKItemSet 频繁k-项集

	* @return

	*/

	private Iterator<Set<String>> getIterator(Set<String> itemSet, Set<Set<String>> freqKItemSet) {

	   Iterator<Set<String>> it = freqKItemSet.iterator();

	   while(it.hasNext()) {

	    if(itemSet.equals(it.next())) {

	     break;

	    }

	   }

	   return it;

	}

	/**

	* 根据频繁(k-1)-项集，调用aprioriGen方法，计算频繁k-项集

	*

	* @param k

	* @param freqMItemSet 频繁(k-1)-项集

	* @return

	*/

	public Map<Set<String>, Float> getFreqKItemSet(int k, Set<Set<String>> freqMItemSet) {

	   Map<Set<String>, Integer> candFreqKItemSetMap = new HashMap<Set<String>, Integer>();

	   // 调用aprioriGen方法，得到候选频繁k-项集

	   Set<Set<String>> candFreqKItemSet = this.aprioriGen(k-1, freqMItemSet);

	   // 扫描事务数据库

	   Iterator<Map.Entry<Integer, Set<String>>> it = txDatabase.entrySet().iterator();

	   // 统计支持数

	   while(it.hasNext()) {

	    Map.Entry<Integer, Set<String>> entry = it.next();

	    Iterator<Set<String>> kit = candFreqKItemSet.iterator();

	    while(kit.hasNext()) {

	     Set<String> kSet = kit.next();

	     Set<String> set = new HashSet<String>();

	     set.addAll(kSet);

	     set.removeAll(entry.getValue()); // 候选频繁k-项集与事务数据库中元素做差运算

	     if(set.isEmpty()) { // 如果拷贝set为空，支持数加1

	      if(candFreqKItemSetMap.get(kSet) == null) {

	       Integer value = 1;

	       candFreqKItemSetMap.put(kSet, value);

	      }

	      else {

	       Integer value = 1+candFreqKItemSetMap.get(kSet);

	       candFreqKItemSetMap.put(kSet, value);

	      }

	     }

	    }

	   }

	   // 计算支持度，生成频繁k-项集，并返回

	   return support(candFreqKItemSetMap);

	}

	/**

	* 根据候选频繁k-项集，得到频繁k-项集

	*

	* @param candFreqKItemSetMap 候选k项集(包含支持计数)

	* @return freqKItemSetMap 频繁k项集及其支持度(比例)

	*/

	public Map<Set<String>, Float> support(Map<Set<String>, Integer> candFreqKItemSetMap) {

	   Map<Set<String>, Float> freqKItemSetMap = new HashMap<Set<String>, Float>();

	   Iterator<Map.Entry<Set<String>, Integer>> it = candFreqKItemSetMap.entrySet().iterator();

	   while(it.hasNext()) {

	    Map.Entry<Set<String>, Integer> entry = it.next();

	    // 计算支持度

	    Float supportRate = new Float(entry.getValue().toString())/new Float(txDatabaseCount);

	    if(supportRate<minSup) { // 如果不满足最小支持度，删除

	     it.remove();

	    }

	    else {

	     freqKItemSetMap.put(entry.getKey(), supportRate);

	    }

	   }

	   return freqKItemSetMap;

	}

	/**

	* 挖掘全部频繁项集

	*/

	public void mineFreqItemSet() {

	   // 计算频繁1-项集

	   Set<Set<String>> freqKItemSet = this.getFreq1ItemSet().keySet();

	   freqItemSet.put(1, freqKItemSet);

	   // 计算频繁k-项集(k>1)

	   int k = 2;

	   while(true) {

	    Map<Set<String>, Float> freqKItemSetMap = this.getFreqKItemSet(k, freqKItemSet);

	    if(!freqKItemSetMap.isEmpty()) {

	     this.freqItemSet.put(k, freqKItemSetMap.keySet());

	     freqKItemSet = freqKItemSetMap.keySet();

	    }

	    else {

	     break;

	    }

	    k++;

	   }

	}

	/**

	* <P>挖掘频繁关联规则

	* <P>首先挖掘出全部的频繁项集，在此基础上挖掘频繁关联规则

	*/

	public void mineAssociationRules() {

	   freqItemSet.remove(1); // 删除频繁1-项集

	   Iterator<Map.Entry<Integer, Set<Set<String>>>> it = freqItemSet.entrySet().iterator();

	   while(it.hasNext()) {

	    Map.Entry<Integer, Set<Set<String>>> entry = it.next();

	    for(Set<String> itemSet : entry.getValue()) {

	     // 对每个频繁项集进行关联规则的挖掘

	     mine(itemSet);

	    }

	   }

	}

	/**

	* 对从频繁项集集合freqItemSet中每迭代出一个频繁项集元素，执行一次关联规则的挖掘

	* @param itemSet 频繁项集集合freqItemSet中的一个频繁项集元素

	*/

	public void mine(Set<String> itemSet) {

	   int n = itemSet.size()/2; // 根据集合的对称性，只需要得到一半的真子集

	   for(int i=1; i<=n; i++) {

	    // 得到频繁项集元素itemSet的作为条件的真子集集合

	    Set<Set<String>> properSubset = ProperSubsetCombination.getProperSubset(i, itemSet);

	    // 对条件的真子集集合中的每个条件项集，获取到对应的结论项集，从而进一步挖掘频繁关联规则

	    for(Set<String> conditionSet : properSubset) {

	     Set<String> conclusionSet = new HashSet<String>();

	     conclusionSet.addAll(itemSet);

	     conclusionSet.removeAll(conditionSet); // 删除条件中存在的频繁项

	     confide(conditionSet, conclusionSet); // 调用计算置信度的方法，并且挖掘出频繁关联规则

	    }

	   }

	}

	/**

	* 对得到的一个条件项集和对应的结论项集，计算该关联规则的支持计数，从而根据置信度判断是否是频繁关联规则

	* @param conditionSet 条件频繁项集

	* @param conclusionSet 结论频繁项集

	*/

	public void confide(Set<String> conditionSet, Set<String> conclusionSet) {

	   // 扫描事务数据库

	   Iterator<Map.Entry<Integer, Set<String>>> it = txDatabase.entrySet().iterator();

	   // 统计关联规则支持计数

	   int conditionToConclusionCnt = 0; // 关联规则(条件项集推出结论项集)计数

	   int conclusionToConditionCnt = 0; // 关联规则(结论项集推出条件项集)计数

	   int supCnt = 0; // 关联规则支持计数

	   while(it.hasNext()) {

	    Map.Entry<Integer, Set<String>> entry = it.next();

	    Set<String> txSet = entry.getValue();

	    Set<String> set1 = new HashSet<String>();

	    Set<String> set2 = new HashSet<String>();

	    set1.addAll(conditionSet);

	    set1.removeAll(txSet); // 集合差运算：set-txSet

	    if(set1.isEmpty()) { // 如果set为空，说明事务数据库中包含条件频繁项conditionSet

	     // 计数

	     conditionToConclusionCnt++;

	    }

	    set2.addAll(conclusionSet);

	    set2.removeAll(txSet); // 集合差运算：set-txSet

	    if(set2.isEmpty()) { // 如果set为空，说明事务数据库中包含结论频繁项conclusionSet

	     // 计数

	     conclusionToConditionCnt++;

	    }

	    if(set1.isEmpty() && set2.isEmpty()) {

	     supCnt++;

	    }

	   }

	   // 计算置信度

	   Float conditionToConclusionConf = new Float(supCnt)/new Float(conditionToConclusionCnt);

	   if(conditionToConclusionConf>=minConf) {

	    if(assiciationRules.get(conditionSet) == null) { // 如果不存在以该条件频繁项集为条件的关联规则

	     Set<Set<String>> conclusionSetSet = new HashSet<Set<String>>();

	     conclusionSetSet.add(conclusionSet);

	     assiciationRules.put(conditionSet, conclusionSetSet);

	    }

	    else {

	     assiciationRules.get(conditionSet).add(conclusionSet);

	    }

	   }

	   Float conclusionToConditionConf = new Float(supCnt)/new Float(conclusionToConditionCnt);

	   if(conclusionToConditionConf>=minConf) {

	    if(assiciationRules.get(conclusionSet) == null) { // 如果不存在以该结论频繁项集为条件的关联规则

	     Set<Set<String>> conclusionSetSet = new HashSet<Set<String>>();

	     conclusionSetSet.add(conditionSet);

	     assiciationRules.put(conclusionSet, conclusionSetSet);

	    }

	    else {

	     assiciationRules.get(conclusionSet).add(conditionSet);

	    }

	   }

	}

	/**

	* 经过挖掘得到的频繁项集Map

	*

	* @return 挖掘得到的频繁项集集合

	*/

	public Map<Integer, Set<Set<String>>> getFreqItemSet() {

	   return freqItemSet;

	}

	/**

	* 获取挖掘到的全部的频繁关联规则的集合

	* @return 频繁关联规则集合

	*/

	public Map<Set<String>, Set<Set<String>>> getAssiciationRules() {

	   return assiciationRules;

	}

}

测试类如下：

import java.io.BufferedReader;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.util.HashMap;

import java.util.HashSet;

import java.util.Map;

import java.util.Set;

import java.util.TreeSet;

import junit.framework.TestCase;

/**
 * <B>Test driver for the {@code Apriori} class.</B>
 *
 * <P>The test methods print the mining results instead of asserting on them.
 * The class can be run through JUnit or directly through {@link #main(String[])}.
 *
 * @author king
 * @date 2013/07/28
 */
public class AprioriTest extends TestCase {

    private Apriori apriori;

    /** Transaction database handed to the miner: transaction id -> items. */
    private Map<Integer, Set<String>> txDatabase;

    private Float minSup = Float.valueOf("0.50");

    private Float minConf = Float.valueOf("0.70");

    /**
     * Runs the full frequent-itemset mining once and prints the elapsed time
     * in milliseconds.
     */
    public static void main(String[] args) throws Exception {
        AprioriTest at = new AprioriTest();
        at.setUp();
        long from = System.currentTimeMillis();
        at.testGetFreqItemSet();
        long to = System.currentTimeMillis();
        System.out.println("耗时：" + (to - from));
    }

    /**
     * Loads the transaction database from the data file and creates the miner.
     */
    @Override
    protected void setUp() throws Exception {
        // To run against the small in-memory sample instead, call create() here.
        this.buildData(Integer.MAX_VALUE, "f_faqk_.dat");
        apriori = new Apriori(txDatabase, minSup, minConf);
    }

    /**
     * Builds a small in-memory sample database of four transactions.
     */
    public void create() {
        txDatabase = new HashMap<Integer, Set<String>>();

        Set<String> set1 = new TreeSet<String>();
        set1.add("A");
        set1.add("B");
        set1.add("C");
        set1.add("E");
        txDatabase.put(1, set1);

        Set<String> set2 = new TreeSet<String>();
        set2.add("A");
        set2.add("B");
        set2.add("C");
        txDatabase.put(2, set2);

        Set<String> set3 = new TreeSet<String>();
        set3.add("C");
        set3.add("D");
        txDatabase.put(3, set3);

        Set<String> set4 = new TreeSet<String>();
        set4.add("A");
        set4.add("B");
        set4.add("E");
        txDatabase.put(4, set4);
    }

    /**
     * Builds the transaction database from a text file holding one transaction
     * per line, with items separated by single spaces. Transactions are numbered
     * from 1 in file order.
     *
     * <P>I/O problems are reported on standard error and leave the database
     * with whatever was read up to that point. The reader is always closed.
     *
     * @param totalCount maximum number of transactions to load; nothing is
     *                   loaded when it is zero or negative
     * @param fileName   name of the data file (only the first value is used);
     *                   when omitted the database is left empty
     */
    public void buildData(int totalCount, String... fileName) {
        txDatabase = new HashMap<Integer, Set<String>>();
        if (fileName.length == 0) {
            return;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(fileName[0])));
            int count = 0;
            String line;
            while (count < totalCount && (line = reader.readLine()) != null) {
                Set<String> items = new HashSet<String>();
                for (String token : line.split(" ")) {
                    // Leading or repeated spaces produce empty tokens; they are not items.
                    if (token.length() > 0) {
                        items.add(token);
                    }
                }
                count++;
                this.txDatabase.put(count, items);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Prints the frequent 1-itemsets.
     */
    public void testFreq1ItemSet() {
        System.out.println("挖掘频繁1-项集 : " + apriori.getFreq1ItemSet());
    }

    /**
     * Prints the candidate 2-itemsets produced by {@code aprioriGen}.
     */
    public void testAprioriGen() {
        System.out.println(
            "候选频繁2-项集 ： " +
            this.apriori.aprioriGen(1, this.apriori.getFreq1ItemSet().keySet())
            );
    }

    /**
     * Prints the frequent 2-itemsets.
     */
    public void testGetFreq2ItemSet() {
        System.out.println(
            "挖掘频繁2-项集 ：" +
            this.apriori.getFreqKItemSet(2, this.apriori.getFreq1ItemSet().keySet())
            );
    }

    /**
     * Prints the frequent 3-itemsets, derived from the frequent 2-itemsets.
     */
    public void testGetFreq3ItemSet() {
        System.out.println(
            "挖掘频繁3-项集 ：" +
            this.apriori.getFreqKItemSet(
                3,
                this.apriori.getFreqKItemSet(2, this.apriori.getFreq1ItemSet().keySet()).keySet()
                )
            );
    }

    /**
     * Mines and prints all frequent itemsets.
     */
    public void testGetFreqItemSet() {
        this.apriori.mineFreqItemSet();
        System.out.println("挖掘频繁项集 ：" + this.apriori.getFreqItemSet());
    }

    /**
     * Mines all frequent itemsets, derives the association rules from them and
     * prints the rules.
     */
    public void testMineAssociationRules() {
        this.apriori.mineFreqItemSet();
        this.apriori.mineAssociationRules();
        System.out.println("挖掘频繁关联规则 ：" + this.apriori.getAssiciationRules());
    }
}

参考：
http://hi.baidu.com/shirdrn/item/5b74a313d55256711009b5d8

在此基础上添加了has_infrequent_subset方法，此方法使用先验知识进行剪枝，是典型Apriori算法必备的。

Apriori算法实现的更多相关文章

Apriori算法的原理与python 实现。
前言:这是一个老故事, 但每次看总是能从中想到点什么.在一家超市里,有一个有趣的现象:尿布和啤酒赫然摆在一起出售.但是这个奇怪的举措却使尿布和啤酒的销量双双增加了.这不是一个笑话,而是发生在美国沃尔玛 ...
#研发解决方案#基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
数据挖掘算法（四）Apriori算法
参考文献: 关联分析之Apriori算法
机器学习实战 - 读书笔记(11) - 使用Apriori算法进行关联分析
前言最近在看Peter Harrington写的"机器学习实战",这是我的学习心得,这次是第11章 - 使用Apriori算法进行关联分析. 基本概念关联分析(associat ...
关联规则挖掘之apriori算法
前言: 众所周知,关联规则挖掘是数据挖掘中重要的一部分,如著名的啤酒和尿布的问题.今天要学习的是经典的关联规则挖掘算法--Apriori算法一.算法的基本原理由k项频繁集去导出k+1项频繁集. 二 ...
利用Apriori算法对交通路况的研究
首先简单描述一下Apriori算法:Apriori算法分为频繁项集的产生和规则的产生. Apriori算法频繁项集的产生: 令ck为候选k-项集的集合,而Fk为频繁k-项集的集合. 1.首先通过单遍扫 ...
Apriori算法例子
1 Apriori介绍 Apriori算法使用频繁项集的先验知识,使用一种称作逐层搜索的迭代方法,k项集用于探索(k+1)项集.首先,通过扫描事务(交易)记录,找出所有的频繁1项集,该集合记做L1,然 ...
Apriori算法实例----Weka，R, Using Weka in my javacode
学习数据挖掘工具中,下面使用4种工具来对同一个数据集进行研究. 数据描述:下面这些数据是15个同学选修课程情况,在课程大纲中共有10门课程供学生选择,下面给出具体的选课情况,以ARFF数据文件保存,名 ...
Apriori算法在购物篮分析中的运用
购物篮分析是一个很经典的数据挖掘案例,运用到了Apriori算法.下面从网上下载的一超市某月份的数据库,利用Apriori算法进行管理分析.例子使用Python+MongoDB 处理过程1 数据建模( ...
关于apriori算法的一个简单的例子
apriori算法是关联规则挖掘中很基础也很经典的一个算法,我认为很多教程出现大堆的公式不是很适合一个初学者理解.因此,本文列举一个简单的例子来演示下apriori算法的整个步骤. 下面这个表格是代表 ...

随机推荐

BZOJ 2730: [HNOI2012]矿场搭建( tarjan )
先tarjan求出割点.. 割点把图分成了几个双连通分量..只需dfs找出即可. 然后一个bcc有>2个割点, 那么这个bcc就不用建了, 因为一定可以走到其他救援出口. 只有一个割点的bcc就 ...
Ural 1149 - Sinus Dances
Let An = sin(1–sin(2+sin(3–sin(4+…sin(n))…)Let Sn = (…(A1+n)A2+n–1)A3+…+2)An+1For given N print SN I ...
练习 jquery+Ajax+Json 绑定数据分类： asp.net 练习 jquery+Ajax+Json 绑定数据分类： asp.net
练习 jquery+Ajax+Json 绑定数据
nginx区分手机与电脑浏览器并进入相应站点
本文要讲的的是如何使用nginx区分pc和手机访问不同的网站,是物理上完全隔离的两套网站(一套移动端.一套pc端),这样带来的好处pc端和移动端的内容可以不一样,移动版网站不需要包含特别多的内容,只 ...
编译原理Tiny语言的定义
Here is the definition for Tiny language The Tiny lexicon is as follows: Keywords: IF ELSE WRITE R ...
常用位操作，读8位 I2C 1302 18B20 .
/*1302*/ unsigned char DS1302OutputByte(void) //实时时钟读取一字节(内部函数) { unsigned char i; for(i=8; i>0; ...
一步一步重写 CodeIgniter 框架 (12) —— 代码再重构，回归 CI
第一课中搭建的基本的框架模型, 只有一个 index.php 作为执行文件,按这种方式最不稳定的因素就是路径的问题. 我们经常需要通过合适的参数,比如 load_class('output') 或 ...
datanode启动后,在web50070port发现不到datanode节点(能力工场)
直接上问题:这两天为了试验,安装了两套集群: (1)32位hadoop1集群(5个节点); (2)64位hadoop2集群(6个节点) 两个集群中都遇到过这种问题:在namenode正常启动hadoo ...
SignalR系列教程：在MVC5中使用SignalR
本章主要内容: 1:向MVC5添加SignaIr 2: 什么是集线器,如何创建集线器 3: 客户端通过jqery调用集线器本文还是延续“SignaIR快速入门”中聊天室的例子进行讲解.首先我们通过V ...
java组装json和提取一个json的例子
package jsonparsed; import net.sf.json.JSONException; import net.sf.json.JSONObject; import net.sf.j ...

Apriori算法实现

Apriori算法实现的更多相关文章

随机推荐

热门专题