数据挖掘:关联规则的apriori算法在weka的源码分析
相对于机器学习,关联规则的apriori算法更偏向于数据挖掘。
1) 测试文档中调用weka的关联规则apriori算法,如下
// Demo driver: load an ARFF data set, pass it through the Discretize filter,
// build an Apriori model on the result and print the model's text form.
try {
    ArffLoader arffLoader = new ArffLoader();
    arffLoader.setFile(new File("F:\\tools/lib/data/contact-lenses.arff"));
    Instances data = arffLoader.getDataSet();

    // The filter must be told the input format before it is applied.
    Discretize discretizer = new Discretize();
    discretizer.setInputFormat(data);
    data = Filter.useFilter(data, discretizer);

    Apriori miner = new Apriori();
    miner.buildAssociations(data);
    System.out.println(miner.toString());
} catch (Exception ex) {
    ex.printStackTrace();
}
步骤
1 读取数据集data,并提取样本集instances
2 离散化属性Discretize
3 创建Apriori 关联规则模型
4 输出频繁项集(large itemsets)和关联规则集
2) 创建关联规则模型(Apriori对象)的时候,调用设置默认参数的方法resetOptions
public void resetOptions() { m_removeMissingCols = false;
m_verbose = false;
m_delta = 0.05;
m_minMetric = 0.90;
m_numRules = ;
m_lowerBoundMinSupport = 0.1;
m_upperBoundMinSupport = 1.0;
m_significanceLevel = -;
m_outputItemSets = false;
m_car = false;
m_classIndex = -;
}
参数详细解析,见后面的备注1
3)buildAssociations方法的解析,源码如下
public void buildAssociations(Instances instances) throws Exception { double[] confidences, supports;
int[] indices;
FastVector[] sortedRuleSet;
int necSupport = ; instances = new Instances(instances); if (m_removeMissingCols) {
instances = removeMissingColumns(instances);
}
if (m_car && m_metricType != CONFIDENCE)
throw new Exception("For CAR-Mining metric type has to be confidence!"); // only set class index if CAR is requested
if (m_car) {
if (m_classIndex == -) {
instances.setClassIndex(instances.numAttributes() - );
} else if (m_classIndex <= instances.numAttributes() && m_classIndex > ) {
instances.setClassIndex(m_classIndex - );
} else {
throw new Exception("Invalid class index.");
}
} // can associator handle the data?
getCapabilities().testWithFail(instances); m_cycles = ; // make sure that the lower bound is equal to at least one instance
double lowerBoundMinSupportToUse = (m_lowerBoundMinSupport
* instances.numInstances() < 1.0) ? 1.0 / instances.numInstances()
: m_lowerBoundMinSupport; if (m_car) {
// m_instances does not contain the class attribute
m_instances = LabeledItemSet.divide(instances, false); // m_onlyClass contains only the class attribute
m_onlyClass = LabeledItemSet.divide(instances, true);
} else
m_instances = instances; if (m_car && m_numRules == Integer.MAX_VALUE) {
// Set desired minimum support
m_minSupport = lowerBoundMinSupportToUse;
} else {
// Decrease minimum support until desired number of rules found.
m_minSupport = m_upperBoundMinSupport - m_delta;
m_minSupport = (m_minSupport < lowerBoundMinSupportToUse) ? lowerBoundMinSupportToUse
: m_minSupport;
} do { // Reserve space for variables
m_Ls = new FastVector();
m_hashtables = new FastVector();
m_allTheRules = new FastVector[];
m_allTheRules[] = new FastVector();
m_allTheRules[] = new FastVector();
m_allTheRules[] = new FastVector();
if (m_metricType != CONFIDENCE || m_significanceLevel != -) {
m_allTheRules[] = new FastVector();
m_allTheRules[] = new FastVector();
m_allTheRules[] = new FastVector();
}
sortedRuleSet = new FastVector[];
sortedRuleSet[] = new FastVector();
sortedRuleSet[] = new FastVector();
sortedRuleSet[] = new FastVector();
if (m_metricType != CONFIDENCE || m_significanceLevel != -) {
sortedRuleSet[] = new FastVector();
sortedRuleSet[] = new FastVector();
sortedRuleSet[] = new FastVector();
}
if (!m_car) {
// Find large itemsets and rules
findLargeItemSets();
if (m_significanceLevel != - || m_metricType != CONFIDENCE)
findRulesBruteForce();
else
findRulesQuickly();
} else {
findLargeCarItemSets();
findCarRulesQuickly();
} // prune rules for upper bound min support
if (m_upperBoundMinSupport < 1.0) {
pruneRulesForUpperBoundSupport();
} int j = m_allTheRules[].size() - ;
supports = new double[m_allTheRules[].size()];
for (int i = ; i < (j + ); i++)
supports[j - i] = ((double) ((ItemSet) m_allTheRules[]
.elementAt(j - i)).support()) * (-);
indices = Utils.stableSort(supports);
for (int i = ; i < (j + ); i++) {
sortedRuleSet[].addElement(m_allTheRules[].elementAt(indices[j - i]));
sortedRuleSet[].addElement(m_allTheRules[].elementAt(indices[j - i]));
sortedRuleSet[].addElement(m_allTheRules[].elementAt(indices[j - i]));
if (m_metricType != CONFIDENCE || m_significanceLevel != -) {
sortedRuleSet[].addElement(m_allTheRules[]
.elementAt(indices[j - i]));
sortedRuleSet[].addElement(m_allTheRules[]
.elementAt(indices[j - i]));
sortedRuleSet[].addElement(m_allTheRules[]
.elementAt(indices[j - i]));
}
} // Sort rules according to their confidence
m_allTheRules[].removeAllElements();
m_allTheRules[].removeAllElements();
m_allTheRules[].removeAllElements();
if (m_metricType != CONFIDENCE || m_significanceLevel != -) {
m_allTheRules[].removeAllElements();
m_allTheRules[].removeAllElements();
m_allTheRules[].removeAllElements();
}
confidences = new double[sortedRuleSet[].size()];
int sortType = + m_metricType; for (int i = ; i < sortedRuleSet[].size(); i++)
confidences[i] = ((Double) sortedRuleSet[sortType].elementAt(i))
.doubleValue();
indices = Utils.stableSort(confidences);
for (int i = sortedRuleSet[].size() - ; (i >= (sortedRuleSet[].size() - m_numRules))
&& (i >= ); i--) {
m_allTheRules[].addElement(sortedRuleSet[].elementAt(indices[i]));
m_allTheRules[].addElement(sortedRuleSet[].elementAt(indices[i]));
m_allTheRules[].addElement(sortedRuleSet[].elementAt(indices[i]));
if (m_metricType != CONFIDENCE || m_significanceLevel != -) {
m_allTheRules[].addElement(sortedRuleSet[].elementAt(indices[i]));
m_allTheRules[].addElement(sortedRuleSet[].elementAt(indices[i]));
m_allTheRules[].addElement(sortedRuleSet[].elementAt(indices[i]));
}
} if (m_verbose) {
if (m_Ls.size() > ) {
System.out.println(toString());
}
} if (m_minSupport == lowerBoundMinSupportToUse
|| m_minSupport - m_delta > lowerBoundMinSupportToUse)
m_minSupport -= m_delta;
else
m_minSupport = lowerBoundMinSupportToUse; necSupport = Math.round((float) ((m_minSupport * m_instances
.numInstances()) + 0.5)); m_cycles++;
} while ((m_allTheRules[].size() < m_numRules)
&& (Utils.grOrEq(m_minSupport, lowerBoundMinSupportToUse))
/* (necSupport >= lowerBoundNumInstancesSupport) */
/* (Utils.grOrEq(m_minSupport, m_lowerBoundMinSupport)) */&& (necSupport >= ));
m_minSupport += m_delta;
}
主要步骤解析:
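1 如果m_removeMissingCols为真,使用removeMissingColumns方法删除缺失属性的列;
2 如果参数m_car是真,则进行划分;因为m_car为真表示挖掘类关联规则(此时度量类型必须是置信度),所以把数据划分成两部分:m_instances不包含类属性,m_onlyClass只包含类属性;
3 方法findLargeItemSets查找频繁项集;具体源码见下面
4 方法findRulesQuickly查找所有的关联规则集(当设置了显著性水平或度量类型不是置信度时,改用findRulesBruteForce);
5 方法pruneRulesForUpperBoundSupport按最小支持度上界(upperBoundMinSupport)对规则集剪枝,仅当该上界小于1.0时执行;
6 先按支持度、再按所选度量(默认为置信度)对规则集排序,并最多保留m_numRules条规则;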
4)查找频繁项集findLargeItemSets源码如下:
private void findLargeItemSets() throws Exception { FastVector kMinusOneSets, kSets;
Hashtable hashtable;
int necSupport, necMaxSupport, i = ; // Find large itemsets // minimum support
necSupport = (int) (m_minSupport * m_instances.numInstances() + 0.5);
necMaxSupport = (int) (m_upperBoundMinSupport * m_instances.numInstances() + 0.5); kSets = AprioriItemSet.singletons(m_instances);
AprioriItemSet.upDateCounters(kSets, m_instances);
kSets = AprioriItemSet.deleteItemSets(kSets, necSupport,
m_instances.numInstances());
if (kSets.size() == )
return;
do {
m_Ls.addElement(kSets);
kMinusOneSets = kSets;
kSets = AprioriItemSet.mergeAllItemSets(kMinusOneSets, i,
m_instances.numInstances());
hashtable = AprioriItemSet.getHashtable(kMinusOneSets,
kMinusOneSets.size());
m_hashtables.addElement(hashtable);
kSets = AprioriItemSet.pruneItemSets(kSets, hashtable);
AprioriItemSet.upDateCounters(kSets, m_instances);
kSets = AprioriItemSet.deleteItemSets(kSets, necSupport,
m_instances.numInstances());
i++;
} while (kSets.size() > );
}
主要步骤:
1 类AprioriItemSet.singletons方法,将给定数据集的头信息转换成一个项集的集合, 头信息中的值的顺序是按字典序。
2 方法upDateCounters查找一频繁项目集;
3 AprioriItemSet.deleteItemSets方法,删除不满足支持度区间的项目集;
4 使用方法mergeAllItemSets(源码如下)由k-1项目集循环生成k频繁项目集,并且使用方法deleteItemSets删除不满足支持度区间的项目集;
5)由k-1项目集循环生成k频繁项目集的方法mergeAllItemSets,源码如下:
public static FastVector mergeAllItemSets(FastVector itemSets, int size,
int totalTrans) { FastVector newVector = new FastVector();
ItemSet result;
int numFound, k; for (int i = ; i < itemSets.size(); i++) {
ItemSet first = (ItemSet) itemSets.elementAt(i);
out: for (int j = i + ; j < itemSets.size(); j++) {
ItemSet second = (ItemSet) itemSets.elementAt(j);
result = new AprioriItemSet(totalTrans);
result.m_items = new int[first.m_items.length]; // Find and copy common prefix of size 'size'
numFound = ;
k = ;
while (numFound < size) {
if (first.m_items[k] == second.m_items[k]) {
if (first.m_items[k] != -)
numFound++;
result.m_items[k] = first.m_items[k];
} else
break out;
k++;
} // Check difference
while (k < first.m_items.length) {
if ((first.m_items[k] != -) && (second.m_items[k] != -))
break;
else {
if (first.m_items[k] != -)
result.m_items[k] = first.m_items[k];
else
result.m_items[k] = second.m_items[k];
}
k++;
}
if (k == first.m_items.length) {
result.m_counter = ;
newVector.addElement(result);
}
}
}
return newVector;
}
调用方法generateRules生成关联规则
6)生成关联规则的方法generateRules,源码如下
public FastVector[] generateRules(double minConfidence,
FastVector hashtables, int numItemsInSet) { FastVector premises = new FastVector(), consequences = new FastVector(), conf = new FastVector();
FastVector[] rules = new FastVector[], moreResults;
AprioriItemSet premise, consequence;
Hashtable hashtable = (Hashtable) hashtables.elementAt(numItemsInSet - ); // Generate all rules with one item in the consequence.
for (int i = ; i < m_items.length; i++)
if (m_items[i] != -) {
premise = new AprioriItemSet(m_totalTransactions);
consequence = new AprioriItemSet(m_totalTransactions);
premise.m_items = new int[m_items.length];
consequence.m_items = new int[m_items.length];
consequence.m_counter = m_counter; for (int j = ; j < m_items.length; j++)
consequence.m_items[j] = -;
System.arraycopy(m_items, , premise.m_items, , m_items.length);
premise.m_items[i] = -; consequence.m_items[i] = m_items[i];
premise.m_counter = ((Integer) hashtable.get(premise)).intValue();
premises.addElement(premise);
consequences.addElement(consequence);
conf.addElement(new Double(confidenceForRule(premise, consequence)));
}
rules[] = premises;
rules[] = consequences;
rules[] = conf;
pruneRules(rules, minConfidence); // Generate all the other rules
moreResults = moreComplexRules(rules, numItemsInSet, , minConfidence,
hashtables);
if (moreResults != null)
for (int i = ; i < moreResults[].size(); i++) {
rules[].addElement(moreResults[].elementAt(i));
rules[].addElement(moreResults[].elementAt(i));
rules[].addElement(moreResults[].elementAt(i));
}
return rules;
}
几个我想说的
1)不想输出为0的项,可以设置成缺失值?,因为算法会自动删除缺失值的列,不参与关联规则的生成;
2)按照置信度对关联规则排序,是关联规则分类器中使用的,只是提取关联规则,不需要排序;
备注
1)weka的关联规则中参数的详解
1. car 如果设为真,则会挖掘类关联规则而不是全局关联规则。也就是只保留与类标签有关的关联规则(类属性索引classIndex默认设置为-1)。
2. classIndex 类属性索引。如果设置为-1,最后的属性被当做类属性。
3. delta 以此数值为迭代递减单位。不断减小支持度直至达到最小支持度或产生了满足数量要求的规则。
4. lowerBoundMinSupport 最小支持度下界。
5. metricType 度量类型。设置对规则进行排序的度量依据。可以是:置信度(类关联规则只能用置信度挖掘),提升度(lift),杠杆率(leverage),确信度(conviction)。
在 Weka中设置了几个类似置信度(confidence)的度量来衡量规则的关联程度,它们分别是:
a) Lift : P(A,B)/(P(A)P(B)) Lift=1时表示A和B独立。这个数越大(>1),越表明A和B存在于一个购物篮中不是偶然现象,有较强的关联度.
b) Leverage :P(A,B)-P(A)P(B) Leverage=0时A和B独立,Leverage越大A和B的关系越密切
c) Conviction:P(A)P(!B)/P(A,!B) (!B表示B没有发生) Conviction也是用来衡量A和B的独立性。从它和lift的关系(对B取反,代入Lift公式后求倒数)可以看出,这个值越大, A、B越关联。
6. minMetric 度量的最小值。
7. numRules 要发现的规则数。
8. outputItemSets 如果设置为真,会在结果中输出项集。
9. removeAllMissingCols 移除全部为缺失值的列。
10. significanceLevel 显著性水平。显著性检验(仅用于置信度)。
11. upperBoundMinSupport 最小支持度上界。 从这个值开始迭代减小最小支持度。
12. verbose 如果设置为真,则算法会以详细输出(verbose)模式运行。
2)控制台输出结果
Apriori
=======
Minimum support: 0.2 (5 instances)
Minimum metric <confidence>: 0.9
Number of cycles performed: 16
Generated sets of large itemsets:
Size of set of large itemsets L(1): 11
Size of set of large itemsets L(2): 21
Size of set of large itemsets L(3): 6
Best rules found:
1. tear-prod-rate=reduced 12 ==> contact-lenses=none 12 conf:(1)
2. spectacle-prescrip=myope tear-prod-rate=reduced 6 ==> contact-lenses=none 6 conf:(1)
3. spectacle-prescrip=hypermetrope tear-prod-rate=reduced 6 ==> contact-lenses=none 6 conf:(1)
4. astigmatism=no tear-prod-rate=reduced 6 ==> contact-lenses=none 6 conf:(1)
5. astigmatism=yes tear-prod-rate=reduced 6 ==> contact-lenses=none 6 conf:(1)
6. contact-lenses=soft 5 ==> astigmatism=no 5 conf:(1)
7. contact-lenses=soft 5 ==> tear-prod-rate=normal 5 conf:(1)
8. tear-prod-rate=normal contact-lenses=soft 5 ==> astigmatism=no 5 conf:(1)
9. astigmatism=no contact-lenses=soft 5 ==> tear-prod-rate=normal 5 conf:(1)
10. contact-lenses=soft 5 ==> astigmatism=no tear-prod-rate=normal 5 conf:(1)
转载请注明出处:http://www.cnblogs.com/rongyux/
数据挖掘:关联规则的apriori算法在weka的源码分析的更多相关文章
- SURF算法与源码分析、下
上一篇文章 SURF算法与源码分析.上 中主要分析的是SURF特征点定位的算法原理与相关OpenCV中的源码分析,这篇文章接着上篇文章对已经定位到的SURF特征点进行特征描述.这一步至关重要,这是SU ...
- mahout算法源码分析之Collaborative Filtering with ALS-WR (四)评价和推荐
Mahout版本:0.7,hadoop版本:1.0.4,jdk:1.7.0_25 64bit. 首先来总结一下 mahout算法源码分析之Collaborative Filtering with AL ...
- mahout算法源码分析之Collaborative Filtering with ALS-WR拓展篇
Mahout版本:0.7,hadoop版本:1.0.4,jdk:1.7.0_25 64bit. 额,好吧,心头的一块石头总算是放下了.关于Collaborative Filtering with AL ...
- mahout算法源码分析之Collaborative Filtering with ALS-WR 并行思路
Mahout版本:0.7,hadoop版本:1.0.4,jdk:1.7.0_25 64bit. mahout算法源码分析之Collaborative Filtering with ALS-WR 这个算 ...
- OpenCV学习笔记(27)KAZE 算法原理与源码分析(一)非线性扩散滤波
http://blog.csdn.net/chenyusiyuan/article/details/8710462 OpenCV学习笔记(27)KAZE 算法原理与源码分析(一)非线性扩散滤波 201 ...
- Mahout源码分析:并行化FP-Growth算法
FP-Growth是一种常被用来进行关联分析,挖掘频繁项的算法.与Aprior算法相比,FP-Growth算法采用前缀树的形式来表征数据,减少了扫描事务数据库的次数,通过递归地生成条件FP-tree来 ...
- diff.js 列表对比算法 源码分析
diff.js列表对比算法 源码分析 npm上的代码可以查看 (https://www.npmjs.com/package/list-diff2) 源码如下: /** * * @param {Arra ...
- Go合集,gRPC源码分析,算法合集
年初时,朋友圈见到的最多的就是新的一年新的FlAG,年末时朋友圈最多的也是xxxx就要过去了,你的FLAG实现了吗? 这个公众号2016就已经创建了,但截至今年之前从来没发表过文章,现在想想以前很忙, ...
- Ribbon源码分析(一)-- RestTemplate 以及自定义负载均衡算法
如果只是想看ribbon的自定义负载均衡配置,请查看: https://www.cnblogs.com/yangxiaohui227/p/13186004.html 注意: 1.RestTemplat ...
随机推荐
- KMP算法C代码
贴上C代码作参考,关于算法,可以参考网上的博文,但不要参考太多,一两篇相近的即可. #include <stdio.h> #include <stdlib.h> #includ ...
- Java线程常见面试题
v 多线程实现手段: (1).继承Thread类 (2)实现Runable接口 (3)使用线程池 v 线程控制在那个包:java.util.concurrent. (1)提供了线程的运行.(2)线程池 ...
- 关于解决web编码问题的总结
网页的编码问题,一般分为两个方面 1 是网页本身的编码格式, 一般不同的操作系统网页文件存取的编码是不一样的, 但一般来说, 新建网页文件一般都和IDE有关,因为我们平时我是使用编辑工具新建网页文件. ...
- 〈三〉ElasticSearch的认识:搜索、过滤、排序
目录 上节回顾 本节前言 文档的搜索 URL参数条件搜索 请求体条件搜索 语法与示例: 补充: 小节总结: 文档的过滤filter 语法与举例: filter与bool constant_score ...
- Junit测试Service类方法教程
Junit测试是很方便的,本博客记录一下Junit测试一些Service接口的方法,这样可以不运行项目,在@Test注解的方法里直接测试 Maven引入jar包: <properties> ...
- [VB.NET Tips]对多行文本的支持
从Visual Studio 2008开始VB.NET支持多行文本. 用法如下: Dim mString As String = <string>我是 一个多 行文本.</strin ...
- elasticsearch集群扩容和容灾
elasticsearch专栏:https://www.cnblogs.com/hello-shf/category/1550315.html 一.集群健康 Elasticsearch 的集群监控信息 ...
- [LeetCode]singleNumber
题目:singleNumber Given an array of integers, every element appears twice except for one. Find that si ...
- 高通电源管理qpnp-vm-bms驱动
1. compatible节点: qpnp-vm-bms.c使用来控制电池曲线的和BMS功能的,其compatible节点是"qcom,qpnp-vm-bms" 2. probe函 ...
- tomcat下c3p0连接池配置问题
一.首先如果要使用这个连接池,就需要导入c3p0-0.9.2-pre1.jar架包和支持架包mchange-commons-0.2.jar, 我这里测试使用的是msql数据库 当然也需要导入mysql ...