数据结构

  • 键-值对:HashMap

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
/**
 * Loads whitespace-separated (country, population) pairs from
 * {@code data/Countries.dat} into a {@link HashMap}, then prints the number
 * of entries and the population stored under the key "Peru".
 */
public class HashMapExample {
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.dat");
        HashMap<String,Integer> dataset = new HashMap<>();
        // try-with-resources closes the Scanner (and the file behind it)
        // even if a malformed record makes nextInt() throw.
        try (Scanner input = new Scanner(dataFile)) {
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                dataset.put(country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        System.out.printf("dataset.size(): %d%n", dataset.size());
        System.out.printf("dataset.get(\"Peru\"): %,d%n", dataset.get("Peru"));
    }
}

文件处理

  • csv文件

    • 将Map数据存入Excel文件(.xls,使用Apache POI)

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.FileOutputStream;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.Scanner;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import org.apache.poi.hssf.usermodel.HSSFRow;
10 import org.apache.poi.hssf.usermodel.HSSFSheet;
11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
12
13 public class FromMapToExcel {
14 public static void main(String[] args) {
15 Map<String,Integer> map = new TreeMap();
16 load(map, "data/Countries.dat");
17 print(map);
18 storeXL(map, "data/Countries.xls", "Countries Worksheet");
19 }
20
21 /** Loads the data from the specified file into the specified map.
22 */
23 public static void load(Map map, String fileSpec) {
24 File file = new File(fileSpec);
25 try {
26 Scanner input = new Scanner(file);
27 while (input.hasNext()) {
28 String country = input.next();
29 int population = input.nextInt();
30 map.put(country, population);
31 }
32 } catch (FileNotFoundException e) {
33 System.out.println(e);
34 }
35 }
36
37 public static void print(Map map) {
38 Set countries = map.keySet();
39 for (Object country : countries) {
40 Object population = map.get(country);
41 System.out.printf("%-10s%,12d%n", country, population);
42 }
43 }
44
45 /** Stores the specified map in the specified worksheet of
46 the specified Excel workbook file.
47 * @param map
48 * @param fileSpec
49 * @param sheet
50 */
51 public static void storeXL(Map map, String fileSpec, String sheet) {
52 try {
53 FileOutputStream out = new FileOutputStream(fileSpec);
54 HSSFWorkbook workbook = new HSSFWorkbook();
55 HSSFSheet worksheet = workbook.createSheet(sheet);
56 Set countries = map.keySet();
57 short rowNum = 0;
58 for (Object country : countries) {
59 Object population = map.get(country);
60 HSSFRow row = worksheet.createRow(rowNum);
61 row.createCell(0).setCellValue((String)country);
62 row.createCell(1).setCellValue((Integer)population);
63 ++rowNum;
64 }
65 workbook.write(out);
66 out.flush();
67 out.close();
68 } catch (FileNotFoundException e) {
69 System.err.println(e);
70 } catch (IOException e) {
71 System.err.println(e);
72 }
73 }
74 }
    • 读取csv文件

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
/**
 * Reads {@code data/Countries.csv} -- a two-column header line followed by
 * (country, population) records -- and prints it as an aligned table.
 */
public class ReadingCSVFiles {
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.csv");
        try (Scanner input = new Scanner(dataFile)) {
            // Treat any run of commas/whitespace as ONE delimiter. The former
            // pattern ",|\\s" matched a single character at a time, so a CRLF
            // line end or a blank line produced an empty token and the
            // following nextInt() failed.
            // NOTE: as before, whitespace separates tokens, so country names
            // containing spaces are not supported.
            input.useDelimiter("[,\\s]+");
            if (!input.hasNext()) {
                return;  // empty file: nothing to print
            }
            String column1 = input.next();
            String column2 = input.hasNext() ? input.next() : "";
            System.out.printf("%-10s%12s%n", column1, column2);
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                System.out.printf("%-10s%,12d%n", country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
    }
}
    • 读取Excel文件(.xls)到Map

 1 import static dawj.ch02.FromMapToExcel.print;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.TreeMap;
7 import org.apache.poi.hssf.usermodel.HSSFCell;
8 import org.apache.poi.hssf.usermodel.HSSFRow;
9 import org.apache.poi.hssf.usermodel.HSSFSheet;
10 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
11 import org.apache.poi.ss.usermodel.DataFormatter;
12 import org.apache.poi.ss.usermodel.Row;
13
14 public class FromExcelToMap {
15 public static void main(String[] args) {
16 Map map = loadXL("data/Countries.xls", "Countries Worksheet");
17 print(map);
18 }
19
20 /** Returns a Map object containing the data from the specified
21 worksheet in the specified Excel file.
22 */
23 public static Map loadXL(String fileSpec, String sheetName) {
24 Map<String,Integer> map = new TreeMap();
25 try {
26 FileInputStream stream = new FileInputStream(fileSpec);
27 HSSFWorkbook workbook = new HSSFWorkbook(stream);
28 HSSFSheet worksheet = workbook.getSheet(sheetName);
29 DataFormatter formatter = new DataFormatter();
30 for (Row row : worksheet) {
31 HSSFRow hssfRow = (HSSFRow)row;
32 HSSFCell cell = hssfRow.getCell(0);
33 String country = cell.getStringCellValue();
34 cell = hssfRow.getCell(1);
35 String str = formatter.formatCellValue(cell);
36 int population = (int)Integer.getInteger(str);
37 map.put(country, population);
38 }
39 } catch (FileNotFoundException e) {
40 System.err.println(e);
41 } catch (IOException e) {
42 System.err.println(e);
43 }
44 return map;
45 }
46 }
  • 解析JSON文件

 1 import java.io.File;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.util.ArrayList;
7 import java.util.HashMap;
8 import javax.json.Json;
9 import javax.json.stream.JsonParser;
10 import javax.json.stream.JsonParser.Event;
11
12 public class ParsingJSONFiles {
13 public static void main(String[] args) {
14 File dataFile = new File("data/Books.json");
15 try {
16 InputStream stream = new FileInputStream(dataFile);
17 JsonParser parser = Json.createParser(stream);
18 Event event = parser.next(); // advance past START_OBJECT
19 HashMap<String,Object> map = getMap(parser);
20 System.out.println(map);
21 stream.close();
22 } catch (FileNotFoundException e) {
23 System.out.println(e);
24 } catch (IOException e) {
25 System.out.println(e);
26 }
27 }
28
29 /* Returns the HashMap parsed by the specified parser.
30 Called when event.equals(event.START_OBJECT):
31 */
32 public static HashMap getMap(JsonParser parser) {
33 HashMap<String,Object> map = new HashMap();
34 Event event = parser.next(); // advance past START_OBJECT
35 String key = parser.getString();
36 event = parser.next(); // advance past KEY_NAME
37 while (!event.equals(Event.END_OBJECT)) {
38 if (event.equals(Event.VALUE_STRING)) {
39 String value = parser.getString();
40 map.put(key, value);
41 } else if (event.equals(Event.VALUE_NUMBER)) {
42 Integer value = parser.getInt();
43 map.put(key, value);
44 } else if (event.equals(Event.START_ARRAY)) {
45 ArrayList<String> list = getList(parser);
46 map.put(key, list);
47 }
48 event = parser.next();
49 if (event.equals(Event.END_OBJECT)) {
50 break;
51 }
52 key = parser.getString();
53 event = parser.next();
54 }
55 return map;
56 }
57
58 /* Returns the ArrayList parsed by the specified parser.
59 Called when event.equals(event.START_ARRAY):
60 */
61 public static ArrayList getList(JsonParser parser) {
62 ArrayList list = new ArrayList();
63 Event event = parser.next(); // advance past START_ARRAY
64 while (!event.equals(Event.END_ARRAY)) {
65 if (event.equals(Event.VALUE_STRING)) {
66 list.add(parser.getString());
67 event = parser.next();
68 } else if (event.equals(Event.START_OBJECT)) {
69 HashMap<String,Object> map = getMap(parser);
70 list.add(map);
71 event = parser.next();
72 } else if (event.equals(Event.START_ARRAY)) {
73 ArrayList subList = getList(parser); // recursion
74 list.add(subList);
75 event = parser.next();
76 }
77 }
78 return list;
79 }
80 }

数据处理

  • 生成测试数据集

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Random;
5
/**
 * Writes a ROWS x COLS table of random doubles in [0, 1), formatted with six
 * decimals and separated by commas, to {@code data/Output.csv}.
 */
public class GeneratingTestData {
    private static final int ROWS = 8, COLS = 5;
    private static final Random RANDOM = new Random();

    public static void main(String[] args) {
        File outputFile = new File("data/Output.csv");
        // try-with-resources flushes and closes the writer on every path.
        try (PrintWriter writer = new PrintWriter(outputFile)) {
            for (int i = 0; i < ROWS; i++) {
                for (int j = 0; j < COLS-1; j++) {
                    // Locale.ROOT pins the decimal separator to '.'; under a
                    // decimal-comma default locale "%.6f" would emit
                    // "0,123456" and corrupt the comma-separated columns.
                    writer.printf(java.util.Locale.ROOT, "%.6f,", RANDOM.nextDouble());
                }
                writer.printf(java.util.Locale.ROOT, "%.6f%n", RANDOM.nextDouble());
            }
            // PrintWriter swallows IOExceptions; surface any write failure.
            if (writer.checkError()) {
                System.err.println("Error while writing " + outputFile);
            }
        } catch (FileNotFoundException e) {
            System.err.println(e);
        }
    }
}
  • 数据过滤

    • 需求:选择国土面积不低于100万平方公里的内陆国家
    • 过程:数据为dat格式,先定义对应的简单类Country,再写程序将dat中的数据存入Country的Set中,最后做筛选

Country.java

 1 import java.util.HashSet;
2 import java.util.Scanner;
3
/**
 * One record of the countries data set: name, population, area and whether
 * the country is landlocked.
 */
class Country {
    protected String name;
    protected int population;
    protected int area;
    protected boolean landlocked;

    /**
     * Constructs a new Country object from the next four tokens being
     * scanned (name, population, area, landlocked).
     * If there are no more tokens, the fields keep their defaults:
     * {@code name} is null, the ints are 0 and {@code landlocked} is false.
     *
     * @param in scanner positioned at the start of a record
     */
    public Country(Scanner in) {
        // hasNext(), not hasNextLine(): a trailing newline or blank line
        // counts as a "line" but holds no token, so next() used to throw
        // NoSuchElementException at the end of the file.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
            this.area = in.nextInt();
            this.landlocked = in.nextBoolean();
        }
    }

    /** Returns name, population, area and landlocked flag as aligned columns. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d %,12d %b",
                name, population, area, landlocked);
    }
}

FilteringData.java

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashSet;
4 import java.util.Scanner;
5 import java.util.Set;
6 import java.util.TreeMap;
7
8 public class FilteringData {
9 private static final int MIN_AREA = 1000000; // one million
10 public static void main(String[] args) {
11 File file = new File("data/Countries.dat");
12 Set<Country> dataset = readDataset(file);
13
14 for (Country country : dataset) {
15 if (country.landlocked && country.area >= MIN_AREA) {
16 System.out.println(country);
17 }
18 }
19 }
20
21 public static Set readDataset(File file) {
22 Set<Country> set = new HashSet();
23 try {
24 Scanner input = new Scanner(file);
25 input.nextLine(); // read past headers
26 while (input.hasNextLine()) {
27 set.add(new Country(input));
28 }
29 input.close();
30 } catch (FileNotFoundException e) {
31 System.out.println(e);
32 }
33 return set;
34 }
35 }
  • 排序

    • 需求:将Countries.dat中的数据按population进行排序
    • 实现:将数据存入TreeMap
    • 注意:关键字段必须唯一,即两个国家人口不能相同

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.Collections;
4 import java.util.HashMap;
5 import java.util.Scanner;
6 import java.util.Set;
7 import java.util.TreeMap;
8
/**
 * Sorts (country, population) records by population by loading them into a
 * TreeMap keyed on population, then prints them in ascending key order.
 * Populations must be unique: a later record with the same population
 * replaces the earlier one.
 */
public class SortingData {
    public static void main(String[] args) {
        File file = new File("src/main/java/com/hongfeng/SortingData/Countries.dat");
        TreeMap<Integer,String> dataset = new TreeMap<>();
        // try-with-resources closes the scanner even on a malformed record.
        try (Scanner input = new Scanner(file)) {
            while (input.hasNext()) {
                String name = input.next();
                int population = input.nextInt();
                String replaced = dataset.put(population, name);
                if (replaced != null) {
                    // The map can hold one country per population; make the
                    // otherwise silent data loss visible.
                    System.err.printf("Duplicate population %d: %s replaced by %s%n",
                            population, replaced, name);
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        print(dataset);
    }

    /**
     * Prints one line per entry in ascending key order: the key
     * right-justified in 12 columns with grouping separators, then the value
     * left-justified in 16 columns.
     *
     * @param map population-to-country entries to print
     */
    public static void print(TreeMap<Integer,String> map) {
        // entrySet() avoids a second tree lookup per key.
        for (java.util.Map.Entry<Integer,String> entry : map.entrySet()) {
            System.out.printf("%,12d %-16s%n", entry.getKey(), entry.getValue());
        }
    }
}
  • 合并

    • 需求:将多个排好序的文件合并为单个排好序的文件
    • Country类实现Comparable接口,定义从文件创建对象的构造方法,以及比较方法
    • 扫描两个文件,比较,存入新文件,一个文件扫描完后,另一个文件逐项扫描即可

Country.java

 1 import java.util.Scanner;
2
/**
 * A (name, population) record that orders by population, used when merging
 * sorted country files.
 */
class Country implements Comparable<Country> {
    protected String name;
    protected int population;

    /**
     * Constructs a new Country object from the next two tokens being scanned
     * (name, population).
     * If there are no more tokens, {@code name} stays null and
     * {@code population} stays 0; see {@link #isNull()}.
     *
     * @param in scanner positioned at the start of a record
     */
    public Country(Scanner in) {
        // hasNext(), not hasNextLine(): a trailing newline counts as a
        // "line" but holds no token, so next() used to throw
        // NoSuchElementException at the end of the file.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
        }
    }

    /** Returns true if no record was available when this object was built. */
    public boolean isNull() {
        return this.name == null;
    }

    /**
     * Orders countries by ascending population.
     * Uses Integer.compare because subtracting the two values can overflow
     * and flip the sign of the result.
     */
    @Override
    public int compareTo(Country that) {
        return Integer.compare(this.population, that.population);
    }

    /** Returns the name and population as aligned columns. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d",
                name, population);
    }
}

MergingFiles.java

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Scanner;
5
6 public class MergingFiles {
7 public static void main(String[] args) {
8 File inFile1 = new File("data/Countries1.dat");
9 File inFile2 = new File("data/Countries2.dat");
10 File outFile = new File("data/Countries.dat");
11 try {
12 Scanner in1 = new Scanner(inFile1);
13 Scanner in2 = new Scanner(inFile2);
14 PrintWriter out = new PrintWriter(outFile);
15 Country country1 = new Country(in1);
16 Country country2 = new Country(in2);
17 System.out.println(country1.hashCode());
18 System.out.println(country2.hashCode());
19 while (!country1.isNull() && !country2.isNull()) {
20 if (country1.compareTo(country2) < 0) {
21 out.println(country1);
22 country1 = new Country(in1);
23 } else {
24 out.println(country2);
25 country2 = new Country(in2);
26 }
27 }
28 while (!country1.isNull()) {
29 out.println(country1);
30 country1 = new Country(in1);
31 }
32 while (!country2.isNull()) {
33 out.println(country2);
34 country2 = new Country(in2);
35 }
36 in1.close();
37 in2.close();
38 out.close();
39 } catch (FileNotFoundException e) {
40 System.out.println(e);
41 }
42 }
43 }

[Java] 数据分析--数据预处理的更多相关文章

  1. pandas神器操作excel表格大全(数据分析数据预处理)

    使用pandas库操作excel,csv表格操作大全 关注公众号"轻松学编程"了解更多,文末有公众号二维码,可以扫码关注哦. 前言 准备三份csv表格做演示: 成绩表.csv su ...

  2. [Java]数据分析--数据可视化

    时间序列 需求:将一组字符顺序添加到时间序列中 实现:定义时间序列类TimeSeries,包含静态类Entry表示序列类中的各项,以及add,get,iterator,entry方法 TimeSeri ...

  3. 小白学 Python 数据分析(9):Pandas (八)数据预处理(2)

    人生苦短,我用 Python 前文传送门: 小白学 Python 数据分析(1):数据分析基础 小白学 Python 数据分析(2):Pandas (一)概述 小白学 Python 数据分析(3):P ...

  4. 【新人赛】阿里云恶意程序检测 -- 实践记录10.20 - 数据预处理 / 训练数据分析 / TF-IDF模型调参

    Colab连接与数据预处理 Colab连接方法见上一篇博客 数据预处理: import pandas as pd import pickle import numpy as np # 训练数据和测试数 ...

  5. EEGLAB数据分析:预处理与后续处理

    来源:http://blog.sina.com.cn/s/blog_13171a73d0102v4zx.html 数据预处理主要包括数据导入.电极定位.电极返回.滤波.去除伪迹.重建参考.分段.叠加平 ...

  6. Java大数据应用领域及就业方向

    最难毕业季,2017高校毕业生达到795万,许多学生面临着毕业即失业的尴尬.面对着与日俱增的竞争形势和就业压力,很多毕业生选择去知了堂学习社区镀金,以提高自己的就业竞争力,其中Java大数据是学生选择 ...

  7. Java大数据人才应用领域广,就业薪酬高

    互联网创造了大数据应用的规模化环境,大数据应用成功的案例大都是在互联网上发生的, 互联网业务提供了数据,互联网企业开发了处理软件,互联网企业的创新带来了大数据应用 的活跃,没有互联网便没有今天的大数据 ...

  8. 数据准备<3>:数据预处理

    数据预处理是指因为算法或者分析需要,对经过数据质量检查后的数据进行转换.衍生.规约等操作的过程.整个数据预处理工作主要包括五个方面内容:简单函数变换.标准化.衍生虚拟变量.离散化.降维.本文将作展开介 ...

  9. 【sklearn】数据预处理 sklearn.preprocessing

    数据预处理 标准化 (Standardization) 规范化(Normalization) 二值化 分类特征编码 推定缺失数据 生成多项式特征 定制转换器 1. 标准化Standardization ...

随机推荐

  1. maven 打包和构建的Linux命令(mvn)

    maven 打包构建相关命令 命令 mvn clean package 依次执行clean.resources.compile.testResources.testCompile.test.jar(打 ...

  2. 第30 章 : 理解 RuntimeClass 与使用多容器运行时

    理解 RuntimeClass 与使用多容器运行时 本文将主要分享以下三方面的内容: RuntimeClass 需求来源 RuntimeClass 功能介绍 多容器运行时示例 RuntimeClass ...

  3. element Notification 通知文字换行小技巧

    this.$notify({ title: "通知", message: res.result, iconClass: "el-icon-bell",//自定义 ...

  4. 201871030134-余宝鹏 实验三 结对项目—《D{0-1}KP 实例数据集算法实验平台》项目报告

    项目 内容 课程班级博客链接 班级博客 这个作业要求链接 作业要求 我的课程学习目标 1.体验软件项目开发中的两人合作,练习结对编程(Pair programming) 2.掌握GitHub协作开发程 ...

  5. 二、python学习-函数

    类型判断 1.type()直接获取类型 2.isinstance 用法一:isinstance(值,类型) 返回真或假 用法二:isinstance(值,(类型1,类型2 ...)) 有一个类型满足 ...

  6. Leedcode算法专题训练(位运算)

    https://www.cnblogs.com/findbetterme/p/10787118.html 看这个就完事了 1. 统计两个数的二进制表示有多少位不同 461. Hamming Dista ...

  7. .Net程序内存泄漏解析

    一.概要 大概在今年三月份的时候突然被紧急调到另外一个项目组解决线上内存泄漏问题.经过两周的玩命奋战终于解决了这个问题这里把心路历程及思路分享给大家.希望可以帮助到各位或现在正遇到这样事情的小伙伴提供 ...

  8. 手写Spring AOP,快来瞧一瞧看一看撒!

    目录 AOP分析 Advice实现 定义Advice接口 定义前置.后置.环绕和异常增强接口 Pointcut实现 定义PointCut接口 定义正则表达式的实现类:RegExpressionPoin ...

  9. C#开发医学影像胶片打印系统(一):万能花式布局的实现思路

    本篇文章将介绍开发医学影像胶片打印系统(printscu模式)遇到不规则排版时的一种思路, 一般来讲,医院打印胶片时都是整张胶片打印,但有时需要将多个病人或一个病人的多个检查打印在同一张胶片上, 这时 ...

  10. 【Navicat】获取表结构的DDL语句以及获取更新表字段的操作的DDL

    1.获取表结构的DDL语句 2.获取修改表结构某一字段的DDL语句  设计表-修改表字段(记住不要保存)-SQL预览