python数据处理与机器学习

提纲

numpy:

#genfromtxt

import numpy as np

#genfromtxtdata=np.genfromtxt("genfromtxtdata")

#print(help(np.genfromtxt))

#matrix-list of list

# A 2-D array (matrix) is built from a sequence of equal-length rows.
matrix = np.array([(12, 12), (12, 12), (1, 13)])
print(matrix)

# Mixed int/float input is coerced to one common dtype (float here).
dataa = np.array([1, 2, 4.0, 1])

# Slicing

# Comparisons
#datab=dataa
# A comparison returns an array of True/False values.
# Use that boolean array to select the elements equal to a given value.
#booldata=(datab==1)
#print(datab[booldata])

# Select every row whose second column equals 13.
boolmatrix = matrix[:, 1] == 13
print(matrix[boolmatrix])

# & and | combine boolean arrays (and / or).

# Type conversion: astype returns a new array, dataa itself is unchanged.
dataa.astype(float)

# Extremes
np.min(dataa)

# Sum along an axis (axis=1 gives one sum per row).
np.sum(matrix, axis=1)

#np.zeros((3,4)->元组格式)

#np.arange(15).reshape(3,5)

#np.random.random()->先进入random模块，返回[0,1)区间内的随机浮点数

#np.linspace(0,2*np.pi,100)->均匀取值

#np.exp()

#相减：维度一样对应相减，不一样都减去后一个数

A = np.array([[1, 2], [1, 1]])
B = A.copy()  # same values as A, held in a separate array

# Element-wise product.
print(np.multiply(A, B))

# Matrix product, written two equivalent ways.
print(A @ B)
print(np.matmul(A, B))

#矩阵操作

#向下取

# Round down: random values in [0, 10) floored to whole numbers (stored as floats).
a = np.floor(np.random.random((3, 4)) * 10)
b = np.floor(np.random.random((3, 4)) * 10)

# Flatten the matrix into a 1-D vector.
print(a)
print(a.ravel())

# Concatenation
#print(np.hstack((a,b)))
#print(np.vstack((a,b)))

# Splitting
# NOTE(review): a has 3 rows here, so vsplit(a,2) would raise; 3 divides evenly.
#print(np.hsplit(a,2))
#print(np.vsplit(a,2))

# Copying data
# Plain assignment copies nothing: b becomes another name for the same array.
b = a
b.shape = (4, 3)
# Reshaping through b reshaped a as well.
print(a)
# a and b have the same id, i.e. they refer to the same object in memory.
print(id(a), id(b))

# Shallow copy
# c is a different object from a but shares its data buffer, so writing
# through c also changes a; the shape of c is independent of a.
c = a.view()
c.shape = (2, 6)
c[1, 1] = 11
print(a.shape)
print(a)

# Deep copy
# d owns its own data and is fully independent of a.
d = a.copy()

# Indexing
# Row index of the maximum value in each column.
intt = a.argmax(axis=0)
print(intt)

#扩展数组

# Repeat the array [1, 11] on a 2 x 3 grid of tiles.
a = np.arange(1, 20, 10)
b = np.tile(a, reps=(2, 3))
print(b)

# Sorting
a = np.array([[1, 2, 3], [3, 2, 1]])
# Indices that would sort each row in ascending order.
j = a.argsort()
# In-place sort along the last axis (axis 1 for a 2-D array).
a.sort()
print(j)
print(a)

pandas:

import pandas as pd

import numpy as np

# `%pwd` is an IPython magic and a SyntaxError in plain Python;
# os.getcwd() returns the same working directory in both environments.
import os

current_path = os.getcwd()
print(current_path)

#food_info=pd.read_csv("food_info.csv")

#DataFrame数据类型

#print(type(food_info))

#print(food_info.dtypes)

#food_info.head()

#food_info.tail(4)

#print(food_info.columns)

#print(food_info.shape)

#索引与计算

#print(food_info.loc[0])

#传入一个list->多列

#print(food_info[["NDB_No","Shrt_Desc"]])

#column_list=food_info.columns.tolist()

#print(column_list)

##数据预处理

#food_info.sort_values("NDB_No",inplace=True)

##排序后缺失值会被放到最后

##从小到大排序

#print(food_info["NDB_No"])

##从大到小

#food_info.sort_values("NDB_No",inplace=True,ascending=False)

#print(food_info["NDB_No"])

titanic_train_info=pd.read_csv("titanic_train.csv")

#print(titanic_train_info.head())

#age=titanic_train_info["Age"]

#print(age.loc[0:10])

#age_is_null=pd.isnull(age)

#print(age_is_null)

#age_null_true=age[age_is_null]

#age_null_count=len(age_null_true)

#print(age_null_count)

#除去缺失值求平均

#age_null_false=titanic_train_info["Age"][age_is_null==False]

#average_age=sum(age_null_false)/len(age_null_false)

#average_age1=titanic_train_info["Age"].mean()

#print(average_age,average_age1)

#数据统计表

#基准-统计对象-方法

#求均值是默认方法

#passager_survival=titanic_train_info.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)

#print(passager_survival)

#passager_age=titanic_train_info.pivot_table(index="Pclass",values="Age",aggfunc=np.mean)

#print(passager_age)

#port_stats=titanic_train_info.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)

#print(port_stats)

## Drop rows with missing values
# This binding was commented out in the original, so every use of
# titanic_train_info1 below raised NameError. It is an alias of the frame
# loaded above, not a copy; dropna() returns a new frame and leaves it unchanged.
titanic_train_info1 = titanic_train_info

# axis=0 drops rows; a row is dropped when "Age" or "Sex" is missing.
drop_na_columns = titanic_train_info1.dropna(axis=0, subset=["Age", "Sex"])
drop_na_columns.head()

# Look up a single value by row label and column name.
row_index_83_age = titanic_train_info1.loc[83, "Age"]
print(row_index_83_age)

#自定义函数

#titanic_train_info1.apply("函数名")

#series结构

import pandas as pd

score_csv=pd.read_csv("fandango_score_comparison.csv")

series_FILM=score_csv["FILM"]

#print(type(series_FILM))

from pandas import Series

film_names=series_FILM.values

#print(type(film_names))

series_rt=score_csv["RottenTomatoes"]

#print(series_rt)

rt_scores=series_rt.values

print(rt_scores)

#以名字作为索引

series_customer=Series(rt_scores,index=film_names)

series_customer["Minions (2015)"]

series_customer[5:10]

matplotlib:

#折线图

import pandas as pd

unrate=pd.read_csv("UNRATE.csv")

unrate["DATE"]=pd.to_datetime(unrate["DATE"])

#print(unrate.head(12))

import matplotlib.pyplot as plt

#first_twelve=unrate[0:100]

#plt.plot(first_twelve["DATE"],first_twelve["VALUE"])

#plt.xticks(rotation=45)

#plt.xlabel("month")

#plt.ylabel("rate")

#plt.title("失业率")

#plt.show()

#fig=plt.figure()

#ax1=fig.add_subplot(4,3,1)

#ax2=fig.add_subplot(4,3,2)

#ax2=fig.add_subplot(4,3,6)

import numpy as np

#fig=plt.figure(figsize=(10,6))

#ax1=fig.add_subplot(2,1,1)

#ax2=fig.add_subplot(2,1,2)

#ax1.plot(np.random.randint(1,5,5),np.arange(5))

#ax2.plot(np.arange(10)*3,np.arange(10))

#plt.show()

unrate["Month"]=unrate["DATE"].dt.month

#fig=plt.figure(figsize=(6,3))

#plt.plot(unrate[0:12]["Month"],unrate[0:12]["VALUE"],c="red")

#plt.plot(unrate[12:24]["Month"],unrate[12:24]["VALUE"],c="blue")

# Overlay the first five 12-row slices of unrate on one figure, one colour
# per slice, labelled 1948, 1949, ...
fig = plt.figure(figsize=(10, 5))
colors = ["red", "blue", "green", "orange", "black"]
for i, color in enumerate(colors):
    # Rows [12*i, 12*(i+1)) are the i-th block of twelve records.
    subset = unrate[i * 12:(i + 1) * 12]
    plt.plot(subset["Month"], subset["VALUE"], c=color, label=str(1948 + i))

# Let matplotlib choose the legend position.
plt.legend(loc="best")
plt.show()

#bar

import pandas as pd

reviews = pd.read_csv('fandango_scores.csv')

cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']

norm_reviews = reviews[cols]

#print(norm_reviews[:1])

import matplotlib.pyplot as plt

from numpy import arange

#The Axes.bar() method has 2 required parameters, left and height.

#We use the left parameter to specify the x coordinates of the left sides of the bar.

#We use the height parameter to specify the height of each bar

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']

# Ratings of the first film (row label 0) as a plain array.
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based equivalent.
bar_heights = norm_reviews.loc[0, num_cols].values
# Bars are 0.5 wide and centred on 0.75, 1.75, ..., 4.75.
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)

fig, ax = plt.subplots()
# Vertical bars.
ax.bar(bar_positions, bar_heights, 0.5)
# Horizontal bars, drawn on the same axes to show the barh variant.
ax.barh(bar_positions, bar_heights, 0.5)

ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)
ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

#散点图

#Let's look at a plot that can help us visualize many points.

#函数返回一个figure图像和一个子图ax的array列表。

fig = plt.figure(figsize=(10,5))

ax1 = fig.add_subplot(2,1,1)

ax2 = fig.add_subplot(2,1,2)

ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])

ax1.set_xlabel('Fandango')

ax1.set_ylabel('Rotten Tomatoes')

ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])

ax2.set_xlabel('Rotten Tomatoes')

ax2.set_ylabel('Fandango')

plt.show()

#直方图

import pandas as pd

import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')

cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']

norm_reviews = reviews[cols]

#print(norm_reviews[:5])

#数据计数

fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()

#数据索引从小到大排列

fandango_distribution = fandango_distribution.sort_index()

imdb_distribution = norm_reviews['IMDB_norm'].value_counts()

imdb_distribution = imdb_distribution.sort_index()

#print(fandango_distribution)

#print(imdb_distribution)

fig, ax = plt.subplots()

#ax.hist(norm_reviews['Fandango_Ratingvalue'])

#bins指定个数，range指定区间

ax.hist(norm_reviews['Fandango_Ratingvalue'],bins=20)

ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5),bins=20)

ax.set_ylim(0,20)

#四分图（盒图）

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']

fig, ax = plt.subplots()

ax.boxplot(norm_reviews[num_cols].values)

ax.set_xticklabels(num_cols, rotation=90)

ax.set_ylim(0,5)

plt.show()

#一些细节

import pandas as pd

import matplotlib.pyplot as plt

# women_degrees was used below without ever being loaded (NameError).
# NOTE(review): the file name is assumed from the column names used here
# ('Year', 'Biology', ...) — TODO confirm the actual CSV path.
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')

fig, ax = plt.subplots()
ax.plot(women_degrees['Year'], women_degrees['Biology'], label='Women')
# The men's share is plotted as the complement of the women's percentage.
ax.plot(women_degrees['Year'], 100 - women_degrees['Biology'], label='Men')

# Hide the tick marks. tick_params expects booleans; the string "off" is
# truthy, so the original call left the ticks visible.
ax.tick_params(bottom=False, top=False, left=False, right=False)
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
ax.legend(loc="upper right")

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']

fig = plt.figure(figsize=(12, 12))

#for sp in range(0,4):

#    ax = fig.add_subplot(2,2,sp+1)

#    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c='blue', label='Women')

#    ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c='green', label='Men')

#    # Add your code here.

#

## Calling pyplot.legend() here will add the legend to the last subplot that was created.

#plt.legend(loc='upper right')

#plt.show()

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']

# One subplot per major on a 2 x 2 grid.
fig = plt.figure(figsize=(12, 12))
for sp in range(0, 4):
    ax = fig.add_subplot(2, 2, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c='blue', label='Women')
    ax.plot(women_degrees['Year'], 100 - women_degrees[major_cats[sp]], c='green', label='Men')
    # Remove the four border lines around the plot.
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major_cats[sp])
    # tick_params expects booleans; the string "off" is truthy and did not
    # hide the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# Calling pyplot.legend() here will add the legend to the last subplot that was created.
plt.legend(loc='upper right')
plt.show()

#Setting Line Width

# RGB colours expressed as fractions in [0, 1], as matplotlib expects.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
for sp in range(0, 4):
    ax = fig.add_subplot(2, 2, sp + 1)
    # Set the line width when specifying how each line should look.
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women', linewidth=10)
    ax.plot(women_degrees['Year'], 100 - women_degrees[major_cats[sp]], c=cb_orange, label='Men', linewidth=10)
    # Remove the four border lines around the plot.
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major_cats[sp])
    # tick_params expects booleans; the string "off" is truthy and did not
    # hide the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# The legend is attached to the last subplot created.
plt.legend(loc='upper right')
plt.show()

stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

# Six subplots side by side in a single row.
fig = plt.figure(figsize=(18, 3))
for sp in range(0, 6):
    ax = fig.add_subplot(1, 6, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
    # Remove the four border lines around the plot.
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(stem_cats[sp])
    # tick_params expects booleans; the string "off" is truthy and did not
    # hide the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

    # Text labels replace a legend; only the first and last subplots get
    # them, at hand-picked data coordinates.
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()

seaborn:

#seaborn风格模板

import seaborn as sns

import matplotlib as mpl

import matplotlib.pyplot as plt

import numpy as np

%matplotlib inline

def sinplot(flip=1):
    """Plot six phase-shifted sine curves of decreasing amplitude.

    Curve k (k = 1..6) is sin(x + 0.5*k) * (7 - k) * flip, sampled at 100
    evenly spaced points on [0, 14] and drawn on the current pyplot axes.

    Args:
        flip: Multiplier applied to every curve; -1 mirrors the plot
            vertically.
    """
    xs = np.linspace(0, 14, 100)
    for k in range(1, 7):
        phase = k * 0.5
        plt.plot(xs, np.sin(xs + phase) * (7 - k) * flip)

#sns默认风格（有五种主题风格）

#sns.set()

#sinplot()

#sns.set_style("whitegrid")

#sns.set_style("dark")

#sns.set_style("white")

#sns.set_style("ticks")

#data=np.random.normal(size=(20,6))+np.arange(6)/2

#sns.boxplot(data=data)

#去掉上方和右边的线条

#sns.despine()

#sns.despine(offset=10)

#sns.despine(left=True)

#with内执行的都是当前风格

#with sns.axes_style("darkgrid"):

#    plt.subplot(211)

#    sinplot()

#plt.subplot(212)

#sinplot(-1)

##设置整体布局

sns.set_style("whitegrid")

sns.set_context("paper",font_scale=2.5,rc=({"lines.linewidth":4.5}))#poster/notebook

plt.figure(figsize=(8,6))

sinplot()

# 颜色（离散型与连续型）

>颜色很重要

>color_palette()能传入任何matplot所支持的颜色

>color_palette()不写参数则默认颜色

>set_palette()设置所有图的颜色

#分类色板

#默认的绘图颜色

current_palette=sns.color_palette()

sns.palplot(current_palette)

#hls默认的颜色空间

sns.palplot(sns.color_palette("hls",8))

#把颜色放到数据中

fig=plt.figure(figsize=(10,6))

data=np.random.normal(size=(20,6))+np.arange(6)/2

sns.boxplot(data=data,palette=sns.color_palette("hls",8))

#更改调色板亮度与饱和度

#fig=plt.figure(figsize=(10,6))

#sns.palplot(sns.hls_palette(8,l=.2,h=.9))

#sns.boxplot(data=data,palette=sns.hls_palette(8,l=.2,h=.9))

#调出来成对的颜色

sns.palplot(sns.color_palette("Paired",8))

使用xkcd来命名颜色

xkcd包含了一套众包努力的针对随机RGB色的命名，产生了954个可以随时通过xkcd_rgb字典中调用的命名颜色

plt.plot([0,1],[0,1],sns.xkcd_rgb["pale red"],lw=3)

plt.plot([0,1],[0,2],sns.xkcd_rgb["medium green"],lw=3)

plt.plot([0,1],[0,3],sns.xkcd_rgb["denim blue"],lw=3)

#连续画板

#色彩可以变换，比如用颜色的变化表示值重要性的变化

sns.palplot(sns.color_palette("Blues"))

#由深到浅

sns.palplot(sns.color_palette("Blues_r"))

#线性调色板

sns.palplot(sns.color_palette("cubehelix",8))

sns.palplot(sns.cubehelix_palette(8,start=.5,rot=-0.75))

#指定颜色深浅

sns.palplot(sns.light_palette("green"))

sns.palplot(sns.dark_palette("purple"))

# 300 samples from a 2-D normal with negatively correlated components;
# .T splits the (300, 2) result into separate x and y arrays.
x, y = np.random.multivariate_normal([0, 0], [[1, -.5], [-.5, 1]], size=300).T
#plt.scatter(x,y)

fig = plt.figure(figsize=(10, 6))
# as_cmap=True returns a continuous colormap rather than a list of colours.
pal = sns.dark_palette("green", as_cmap=True)
# x and y are keyword-only in current seaborn; the positional form
# kdeplot(x, y) is rejected there.
sns.kdeplot(x=x, y=y, cmap=pal)

python数据处理与机器学习的更多相关文章

Python数据处理PDF
Python数据处理(高清版)PDF 百度网盘链接:https://pan.baidu.com/s/1h8a5-iUr4mF7cVujgTSGOA 提取码:6fsl 复制这段内容后打开百度网盘手机A ...
Python 数据处理库 pandas 入门教程
Python 数据处理库 pandas 入门教程2018/04/17 · 工具与框架 · Pandas, Python 原文出处: 强波的技术博客 pandas是一个Python语言的软件包,在我们使 ...
Python 数据处理库pandas教程（最后附上pandas_datareader使用实例）
0 简单介绍 pandas是一个Python语言的软件包,在我们使用Python语言进行机器学习编程的时候,这是一个非常常用的基础编程库.本文是对它的一个入门教程. pandas提供了快速,灵活和富有 ...
python+sklearn+kaggle机器学习
python+sklearn+kaggle机器学习系列教程 0.kaggle 1. 初级线性回归模型机器学习过程 a. 提取数据 b.数据预处理 c.训练模型 d.根据数据预测 e.验证今天是10 ...
入门系列之Scikit-learn在Python中构建机器学习分类器
欢迎大家前往腾讯云+社区,获取更多腾讯海量技术实践干货哦~ 本文由信姜缘发表于云+社区专栏介绍机器学习是计算机科学.人工智能和统计学的研究领域.机器学习的重点是训练算法以学习模式并根据数据进行预 ...
初识TPOT：一个基于Python的自动化机器学习开发工具
1. TPOT介绍一般来讲,创建一个机器学习模型需要经历以下几步: 数据预处理特征工程模型选择超参数调整模型保存本文介绍一个基于遗传算法的快速模型选择及调参的方法,TPOT:一种基于Pyt ...
如何用Python实现常见机器学习算法-1
最近在GitHub上学习了有关python实现常见机器学习算法目录一.线性回归 1.代价函数 2.梯度下降算法 3.均值归一化 4.最终运行结果 5.使用scikit-learn库中的线性模型实现 ...
参考《Python数据处理》中英文PDF+源代码
在实际操作中掌握数据处理方法,比较实用.采用基于项目的方法,介绍用Python完成数据获取.数据清洗.数据探索.数据呈现.数据规模化和自动化的过程.主要内容包括:Python基础知识,如何从CSV.E ...
python数据处理技巧二
python数据处理技巧二(掌控时间) 首先简单说下关于时间的介绍其中重点是时间戳的处理,时间戳是指格林威治时间1970年01月01日00时00分00秒(北京时间1970年01月01日08时00分00 ...

随机推荐

关于Matlab串口发送HEX格式字符
终于想起来更新一下关于使用Matlab串口发送HEX格式字符.这个用法主要来自于我使用Matlab对机器人进行实时轨迹跟踪的绘制,由于底层限制,自己又不想在中间增加转换模块,就需要直接发送HEX格式指 ...
IIS7如何实现访问HTTP跳转到HTTPS访问转的
加几句,1.安装url重写模块,不需要重启IIS,安装完了就能用.个人感觉比 IIS REWRITE组件更好用,iis rewrite是安装第三方的那种,不缴费只可以把所有规则写在一起,不能区别站点, ...
OpenERP 中国财务模块调整
最开始的模样是这个样子的后三行是没用的,于是在RML文件中注释掉相关的代码,改进后的界面如下: 这个样子看起来是好多了,但是数量跟是十亿千百的那块看起来还是很别扭,调整行高后的结果: 最诡异的事情 ...
SASS的安装和转换为CSS的方法
http://www.cnblogs.com/52css/archive/2012/08/19/sass-how-to-install-and-use.html SASS的安装方法: 1.先安装Rub ...
JAVA的NIO的新特性和小Demo,进一步了解NIO
1.为什么要用NIO NIO 的创建目的是为了让 Java 程序员可以实现高速 I/O 而无需编写自定义的本机代码.NIO 将最耗时的 I/O 操作(即填充和提取缓冲区)转移回操作系统,因而可以极大地 ...
Chapter 3. Lexical Structure
/** * Expression = Expression1 [ExpressionRest] * ExpressionRest = [AssignmentOperator Expression1] ...
JavaScript数据结构-13.散列碰撞（开链法）
<!DOCTYPE html> <html> <head> <meta charset="UTF-8"> <title> ...
Linux系统快速查找文件
有时候下载新的文件或安装新的包但是却搞不清默认放在哪个目录了,这个时候可以使用locate命令进行快速模糊查找比如我使用 go get github.com/coreos/bbolt/... 在一 ...
Go RabbitMQ（四）消息路由
RabbitMQ_Routing 本节内容我们将对发布订阅增加一个特性:订阅子集.比如我们将一些危险的错误消息保存进硬盘中,同时在控制台仍然能够读取所有的消息 Bingings 上一节内容我们将队列跟 ...
To B服务想做移动化？腾讯云案例了解一下
本文由腾讯云助手团队发布于腾讯云云+社区 | 导语:腾讯云那么多资源和服务,就不能手机管理吗? 能. 当用户在使用To B服务时,往往会遇到各种各样的限制: 1.操作难度:涉及各种权限.多重验证 ...

python数据处理与机器学习

numpy:

pandas:

matplotlib:

seaborn:

python数据处理与机器学习的更多相关文章

随机推荐

热门专题