ID3算法

  • 思路:分类算法的输入为训练集,输出为对数据进行分类的函数。ID3算法为分类函数生成分类树
  • 需求:对水果训练集的一个维度(是否甜)进行预测
  • 实现:决策树,熵函数,ID3,weka库 J48类

ComputeGain.java

 1 public class ComputeGain {
2 public static void main(String[] args) {
3 System.out.printf("h(11,16) = %.4f%n", h(11,16));
4 System.out.println("Gain(Size):");
5 System.out.printf("\th(3,5) = %.4f%n", h(3,5));
6 System.out.printf("\th(6,7) = %.4f%n", h(6,7));
7 System.out.printf("\th(2,4) = %.4f%n", h(2,4));
8 System.out.printf("\tg({3,6,2},{5,7,4}) = %.4f%n",
9 g(new int[]{3,6,2},new int[]{5,7,4}));
10 System.out.println("Gain(Color):");
11 System.out.printf("\th(3,4) = %.4f%n", h(3,4));
12 System.out.printf("\th(3,5) = %.4f%n", h(3,5));
13 System.out.printf("\th(2,3) = %.4f%n", h(2,3));
14 System.out.printf("\th(2,4) = %.4f%n", h(2,4));
15 System.out.printf("\tg({3,3,2,2},{4,5,3,4}) = %.4f%n",
16 g(new int[]{3,3,2,2},new int[]{4,5,3,4}));
17 System.out.println("Gain(Surface):");
18 System.out.printf("\th(4,7) = %.4f%n", h(4,7));
19 System.out.printf("\th(4,6) = %.4f%n", h(4,6));
20 System.out.printf("\th(3,3) = %.4f%n", h(3,3));
21 System.out.printf("\tg({4,4,3},{7,6,3}) = %.4f%n",
22 g(new int[]{4,4,3},new int[]{7,6,3}));
23 System.out.println("Gain(Size|SMOOTH):");
24 System.out.printf("\th(1,3) = %.4f%n", h(1,3));
25 System.out.printf("\th(3,3) = %.4f%n", h(3,3));
26 System.out.printf("\tg({1,3,0},{3,3,1}) = %.4f%n",
27 g(new int[]{1,3,0},new int[]{3,3,1}));
28 System.out.println("Gain(Color|SMOOTH):");
29 System.out.printf("\th(2,3) = %.4f%n", h(2,3));
30 System.out.printf("\tg({2,2,0},{3,2,2}) = %.4f%n",
31 g(new int[]{2,2,0},new int[]{3,2,2}));
32 System.out.println("Gain(Size|ROUGH):");
33 System.out.printf("\th(3,6) = %.4f%n", h(3,6));
34 System.out.printf("\th(1,2) = %.4f%n", h(1,2));
35 System.out.printf("\tg({2,1,1},{2,2,2}) = %.4f%n",
36 g(new int[]{2,1,1},new int[]{2,2,2}));
37 System.out.println("Gain(Color|ROUGH):");
38 System.out.printf("\th(4,6) = %.4f%n", h(4,6));
39 System.out.printf("\tg({1,1,1},{2,2,2}) = %.4f%n",
40 g(new int[]{1,0,2,1},new int[]{1,2,2,1}));
41 }
42
43 /* Gain for the splitting {A1, A2, ...}, where Ai
44 has n[i] points, m[i] of which are favorable.
45 */
46 public static double g(int[] m, int[] n) {
47 int sm = 0, sn = 0;
48 double nsh = 0.0;
49 for (int i = 0; i < m.length; i++) {
50 sm += m[i];
51 sn += n[i];
52 nsh += n[i]*h(m[i],n[i]);
53 }
54 return h(sm, sn) - nsh/sn;
55 }
56
57 /* Entropy for m favorable items out of n.
58 */
59 public static double h(int m, int n) {
60 if (m == 0 || m == n) {
61 return 0;
62 }
63 double p = (double)m/n, q = 1 - p;
64 return -p*lg(p) - q*lg(q);
65 }
66
67 /* Returns the binary logarithm of x.
68 */
69 public static double lg(double x) {
70 return Math.log(x)/Math.log(2);
71 }
72 }

h(11,16) = 0.8960
Gain(Size):
h(3,5) = 0.9710
h(6,7) = 0.5917
h(2,4) = 1.0000
g({3,6,2},{5,7,4}) = 0.0838
Gain(Color):
h(3,4) = 0.8113
h(3,5) = 0.9710
h(2,3) = 0.9183
h(2,4) = 1.0000
g({3,3,2,2},{4,5,3,4}) = 0.0260
Gain(Surface):
h(4,7) = 0.9852
h(4,6) = 0.9183
h(3,3) = 0.0000
g({4,4,3},{7,6,3}) = 0.1206
Gain(Size|SMOOTH):
h(1,3) = 0.9183
h(3,3) = 0.0000
g({1,3,0},{3,3,1}) = 0.5917
Gain(Color|SMOOTH):
h(2,3) = 0.9183
g({2,2,0},{3,2,2}) = 0.5917
Gain(Size|ROUGH):
h(3,6) = 1.0000
h(1,2) = 1.0000
g({2,1,1},{2,2,2}) = 0.2516
Gain(Color|ROUGH):
h(4,6) = 0.9183
g({1,1,1},{2,2,2}) = 0.9183

 1 import weka.classifiers.trees.J48;
2 import weka.core.Instances;
3 import weka.core.Instance;
4 import weka.core.converters.ConverterUtils.DataSource;
5
6 public class TestWekaJ48 {
7 public static void main(String[] args) throws Exception {
8 DataSource source = new DataSource("data/AnonFruit.arff");
9 Instances instances = source.getDataSet();
10 instances.setClassIndex(3); // target attribute: (Sweet)
11
12 J48 j48 = new J48(); // an extension of ID3
13 j48.setOptions(new String[]{"-U"}); // use unpruned tree
14 j48.buildClassifier(instances);
15
16 for (Instance instance : instances) {
17 double prediction = j48.classifyInstance(instance);
18 System.out.printf("%4.0f%4.0f%n",
19 instance.classValue(), prediction);
20 }
21 }
22 }

1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1

贝叶斯分类

  • 思路:基于训练集计算的比率生成的函数进行分类

Fruit.java

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashSet;
4 import java.util.Scanner;
5 import java.util.Set;
6
7 public class Fruit {
8 String name, size, color, surface;
9 boolean sweet;
10
11 public Fruit(String name, String size, String color, String surface,
12 boolean sweet) {
13 this.name = name;
14 this.size = size;
15 this.color = color;
16 this.surface = surface;
17 this.sweet = sweet;
18 }
19
20 @Override
21 public String toString() {
22 return String.format("%-12s%-8s%-8s%-8s%s",
23 name, size, color, surface, (sweet? "T": "F") );
24 }
25
26 public static Set<Fruit> loadData(File file) {
27 Set<Fruit> fruits = new HashSet();
28 try {
29 Scanner input = new Scanner(file);
30 for (int i = 0; i < 7; i++) { // read past metadata
31 input.nextLine();
32 }
33 while (input.hasNextLine()) {
34 String line = input.nextLine();
35 Scanner lineScanner = new Scanner(line);
36 String name = lineScanner.next();
37 String size = lineScanner.next();
38 String color = lineScanner.next();
39 String surface = lineScanner.next();
40 boolean sweet = (lineScanner.next().equals("T"));
41 Fruit fruit = new Fruit(name, size, color, surface, sweet);
42 fruits.add(fruit);
43 }
44 } catch (FileNotFoundException e) {
45 System.err.println(e);
46 }
47 return fruits;
48 }
49
50 public static void print(Set<Fruit> fruits) {
51 int k=1;
52 for (Fruit fruit : fruits) {
53 System.out.printf("%2d. %s%n", k++, fruit);
54 }
55 }
56 }

BayesianTest.java

 1 import java.io.File;
2 import java.util.Set;
3
4 public class BayesianTest {
5 private static Set<Fruit> fruits;
6
7 public static void main(String[] args) {
8 fruits = Fruit.loadData(new File("data/Fruit.arff"));
9 Fruit fruit = new Fruit("cola", "SMALL", "RED", "SMOOTH", false);
10 double n = fruits.size(); // total number of fruits in training set
11 double sum1 = 0; // number of sweet fruits
12 for (Fruit f : fruits) {
13 sum1 += (f.sweet? 1: 0);
14 }
15 double sum2 = n - sum1; // number of sour fruits
16 double[][] p = new double[4][3];
17 for (Fruit f : fruits) {
18 if (f.sweet) {
19 p[1][1] += (f.size.equals(fruit.size)? 1: 0)/sum1;
20 p[2][1] += (f.color.equals(fruit.color)? 1: 0)/sum1;
21 p[3][1] += (f.surface.equals(fruit.surface)? 1: 0)/sum1;
22 } else {
23 p[1][2] += (f.size.equals(fruit.size)? 1: 0)/sum2;
24 p[2][2] += (f.color.equals(fruit.color)? 1: 0)/sum2;
25 p[3][2] += (f.surface.equals(fruit.surface)? 1: 0)/sum2;
26 }
27 }
28 double pc1 = p[1][1]*p[2][1]*p[3][1]*sum1/n;
29 double pc2 = p[1][2]*p[2][2]*p[3][2]*sum2/n;
30 System.out.printf("pc1 = %.4f, pc2 = %.4f%n", pc1, pc2);
31 System.out.printf("Predict %s is %s.%n",
32 fruit.name, (pc1 > pc2? "sweet": "sour"));
33 }
34 }

pc1 = 0.0186, pc2 = 0.0150
Predict cola is sweet.

TestWekaBayes.java

 1 import java.util.List;
2 import weka.classifiers.Evaluation;
3 import weka.classifiers.bayes.NaiveBayes;
4 import weka.classifiers.evaluation.Prediction;
5 import weka.core.Instance;
6 import weka.core.Instances;
7 import weka.core.converters.ConverterUtils;
8 import weka.core.converters.ConverterUtils.DataSource;
9
10 public class TestWekaBayes {
11 public static void main(String[] args) throws Exception {
12 // ConverterUtils.DataSource source = new ConverterUtils.DataSource("data/AnonFruit.arff");
13 DataSource source = new DataSource("data/AnonFruit.arff");
14 Instances train = source.getDataSet();
15 train.setClassIndex(3); // target attribute: (Sweet)
16 //build model
17 NaiveBayes model=new NaiveBayes();
18 model.buildClassifier(train);
19
20 //use
21 Instances test = train;
22 Evaluation eval = new Evaluation(test);
23 eval.evaluateModel(model,test);
24 List <Prediction> predictions = eval.predictions();
25 int k = 0;
26 for (Instance instance : test) {
27 double actual = instance.classValue();
28 double prediction = eval.evaluateModelOnce(model, instance);
29 System.out.printf("%2d.%4.0f%4.0f", ++k, actual, prediction);
30 System.out.println(prediction != actual? " *": "");
31 }
32 }
33 }

1. 1 1
2. 1 1
3. 1 1
4. 1 1
5. 1 1
6. 0 1 *
7. 1 1
8. 0 0
9. 0 0
10. 0 1 *
11. 1 1
12. 1 1
13. 1 1
14. 1 1
15. 0 0
16. 1 1

SVM算法

  • 思路:生成超平面方程,计算数据点位于哪一边

逻辑回归

  • 思路:将目标值属性为布尔值的问题转化成一个数值变量,在转化后的问题上进行线性回归
  • 需求:某政党候选人想知道选举获胜的花费
  • 实现

 1 import org.apache.commons.math3.analysis.function.*;
2 import org.apache.commons.math3.stat.regression.SimpleRegression;
3
4 public class LogisticRegression {
5 static int n = 6;
6 static double[] x = {5, 15, 25, 35, 45, 55};
7 static double[] p = {2./6,2./5, 4./8, 5./9, 3./5, 4./5};
8 static double[] y = new double[n]; // y = logit(p)
9
10 public static void main(String[] args) {
11
12 // Transform p-values into y-values:
13 Logit logit = new Logit();
14 for (int i = 0; i < n; i++) {
15 y[i] = logit.value(p[i]);
16 }
17
18 // Set up input array for linear regression:
19 double[][] data = new double[n][n];
20 for (int i = 0; i < n; i++) {
21 data[i][0] = x[i];
22 data[i][1] = y[i];
23 }
24
25 // Run linear regression of y on x:
26 SimpleRegression sr = new SimpleRegression();
27 sr.addData(data);
28
29 // Print results:
30 for (int i = 0; i < n; i++) {
31 System.out.printf("x = %2.0f, y = %7.4f%n", x[i], sr.predict(x[i]));
32 }
33 System.out.println();
34
35 // Convert y-values back to p-values:
36 Sigmoid sigmoid = new Sigmoid();
37 for (int i = 0; i < n; i++) {
38 double p = sr.predict(x[i]);
39 System.out.printf("x = %2.0f, p = %6.4f%n", x[i], sigmoid.value(p));
40 }
41 }
42 }

x = 5, y = -0.7797
x = 15, y = -0.4067
x = 25, y = -0.0338
x = 35, y = 0.3392
x = 45, y = 0.7121
x = 55, y = 1.0851

x = 5, p = 0.3144
x = 15, p = 0.3997
x = 25, p = 0.4916
x = 35, p = 0.5840
x = 45, p = 0.6709
x = 55, p = 0.7475

k临近

  • 思路:根据临近范围内的样本进行分类

 1 import weka.classifiers.lazy.IBk;  // K-Nearest Neighbors
2 import weka.core.Instances;
3 import weka.core.Instance;
4 import weka.core.converters.ConverterUtils.DataSource;
5
6 public class TestIBk {
7 public static void main(String[] args) throws Exception {
8 DataSource source = new DataSource("data/AnonFruit.arff");
9 Instances instances = source.getDataSet();
10 instances.setClassIndex(3); // target attribute: (Sweet)
11
12 IBk ibk = new IBk();
13 ibk.buildClassifier(instances);
14
15 for (Instance instance : instances) {
16 double prediction = ibk.classifyInstance(instance);
17 System.out.printf("%4.0f%4.0f%n",
18 instance.classValue(), prediction);
19 }
20 }
21 }

1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1

[Java] 数据分析--分类的更多相关文章

  1. Java异常分类 转载

    Java异常分类 http://blog.csdn.net/woshixuye/article/details/8230407     一.基本概念 看java的异常结构图 Throwable是所有异 ...

  2. Java注释分类

    Java注释分类 1.单行注释    //打印结果    System.out.println("结果是:"+result); 2.多行注释    /**     * @autho ...

  3. Java应用分类

    Java应用分类     一.应用程序.指在操作系统上直接运行的,不是浏览器,Java环境用本机的,需要在客户端安装,Java环境可以一起安装.         1.GUI图形界面应用程序       ...

  4. Java凝视分类

    Java凝视分类 1.单行凝视    //打印结果    System.out.println("结果是:"+result); 2.多行凝视    /**     * @autho ...

  5. Java集合【1】--俯瞰java集合分类

    目录 (一) java集合分类 (1) Iterable接口 1. 内部定义的方法 1.1 iterator方法 1.2 forEach方法 1.3 spliterator方法 2. Collecti ...

  6. irms模拟数据生成及数据分析 分类: H_HISTORY 2015-03-06 14:17 212人阅读 评论(0) 收藏

    一.数据准备 1.每天生成随机一个文本,每小时向文本中追加2次数据,每次10万条 随机数据生成: 2,32  * * * *  bash /mnt/jediael/irms/signalGenerat ...

  7. 各种排序算法的分析及java实现 分类: B10_计算机基础 2015-02-03 20:09 186人阅读 评论(0) 收藏

    转载自:http://www.cnblogs.com/liuling/p/2013-7-24-01.html 另可参考:http://gengning938.blog.163.com/blog/sta ...

  8. Java 引用分类:StrongReference、SoftReference、WeakReference、PhantomReference

    一,定义 在Java中,引用的定义是:如果reference类型的数据中存储的数值代表的是另一块内存的起始地址,就称这块内存代表着一个引用.后面在JDK1.2开始,引用的概念被扩充,引用被分为强引用( ...

  9. Java异常分类

    一.基本概念 看java的异常结构图 Throwable是所有异常的根,java.lang.ThrowableError是错误,java.lang.ErrorException是异常,java.lan ...

随机推荐

  1. java例题_21 求 1+2!+3!+...+20!的和

    1 /*21 [程序 21 求阶乘] 2 题目:求 1+2!+3!+...+20!的和 3 程序分析:此程序只是把累加变成了累乘. 4 */ 5 6 /*分析 7 * 1.汲取上一题的教训,这么大的数 ...

  2. .Net Core 3.1浏览器后端服务(五) 引入定时任务Quartz.Net

    一.前言 近期项目中遇到一些需求,需要定时写入数据库,定时刷新缓存的问题,因此需要引入任务调度机制. 我的选择是使用 Quartz.Net,使用的版本是 3.2.4 这里强调一点:3.x的版本与2.x ...

  3. linux日志文件说明

    /var/log/message 系统启动后的信息和错误日志,是Red Hat Linux中最常用的日志之一 /var/log/secure 与安全相关的日志信息 /var/log/maillog 与 ...

  4. 201871030108-冯永萍 实验二 个人项目— D{0-1}背包问题项目报告

    项目 内容 课程班级博客链接 https://edu.cnblogs.com/campus/xbsf/2018CST 这个作业要求链接 https://www.cnblogs.com/nwnu-dai ...

  5. Spring Boot 2.3 新特性优雅停机详解

    什么是优雅停机 先来一段简单的代码,如下: @RestController public class DemoController { @GetMapping("/demo") p ...

  6. Spring Boot 实现配置文件加解密原理

    Spring Boot 配置文件加解密原理就这么简单 背景 接上文<失踪人口回归,mybatis-plus 3.3.2 发布>[1] ,提供了一个非常实用的功能 「数据安全保护」 功能,不 ...

  7. ISODateTimeFormat 转换2019-08-15T00:36:49.366456463Z 日期格式

    import java.text.*; import java.text.SimpleDateFormat; import java.util.*; import org.joda.time.Date ...

  8. Periodic Strings UVA - 455

    ​ A character string is said to have period k if it can be formed by concatenating one or more repet ...

  9. 爬虫框架-scrapy的使用

    Scrapy Scrapy是纯python实现的一个为了爬取网站数据.提取结构性数据而编写的应用框架. Scrapy使用了Twisted异步网络框架来处理网络通讯,可以加快我们的下载速度,并且包含了各 ...

  10. BB网络层测试

    网络层测试2020 问题 1 以下不属于网络层的协议是___A_______. A  DHCP  B ICMP  C IGMP  D ARP 10 分    问题 2 如果目的网络.目的主机都对,但是 ...