数据挖掘之pandas
sdata={'语文':89,'数学':96,'音乐':39,'英语':78,'化学':88}
#字典向Series转化 @@
>>> studata=Series(sdata)
>>> studata
化学 88
数学 96
英语 78
语文 89
音乐 39
dtype: int64
>>> obj=Series(sdata,index=['物理','数学','化学'])
>>> obj
物理 NaN[这个地方没有物理成绩所以是NaN,同时引起下面的数据是float型]
数学 96.0
化学 88.0
dtype: float64
#判断数据行中是否为空值
>>> pd.isnull(obj)
物理 True
数学 False
化学 False
dtype: bool
>>> pd.notnull(obj)
物理 False
数学 True
化学 True
dtype: bool
>>> obj.isnull()
物理 True
数学 False
化学 False
dtype: bool #对应数据相加
>>> en=Series([84,94,51,81],index=['张三','李四','王五','赵六'])
>>> sx=Series([94,81,31,91],index=['张三','赵六','王五','李四'])
>>> en+sx [相加时索引自动对齐]
张三 178
李四 185
王五 82
赵六 162
dtype: int64 #Series 的name 属性
>>> en.name='英语成绩'
>>> en
张三 84
李四 94
王五 51
赵六 81
Name: 英语成绩, dtype: int64
>>> en.index.name='姓名'
>>> en
姓名
张三 84
李四 94
王五 51
赵六 81
Name: 英语成绩, dtype: int64 #索引是可以修改的
>>> en.index=['zs','ll','ww','zl']
>>> en
zs 84
ll 94
ww 51
zl 81
Name: 英语成绩, dtype: int64 #############DataFrame############## >>> data={
'name':['张三','张三','张三','李四','李四','李四'],
'year':[2001,2002,2003,2001,2002,2003],
'weight':[54,50,60,61,63,65],
}
>>> frame=DataFrame(data)
>>> frame
name weight year
0 张三 54 2001
1 张三 50 2002
2 张三 60 2003
3 李四 61 2001
4 李四 63 2002
5 李四 65 2003 #columns可以修改显示顺序和选项
>>> DataFrame(data,columns=['year','weight','name'])
year weight name
0 2001 54 张三
1 2002 50 张三
2 2003 60 张三
3 2001 61 李四
4 2002 63 李四
5 2003 65 李四 >>> DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
year weight name sex
one 2001 54 张三 NaN
two 2002 50 张三 NaN
three 2003 60 张三 NaN
four 2001 61 李四 NaN
five 2002 63 李四 NaN
five 2003 65 李四 NaN #索引相同的情况查询,获取某一行或者几行
>>> a.ix['five']
year weight name sex
five 2002 63 李四 NaN
five 2003 65 李四 NaN #DataFrame-->Series 降维
#获取某一列
>>> info=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
>>> info['name'] one 张三
two 张三
three 张三
four 李四
five 李四
five 李四
Name: name, dtype: object #列赋值
>>> info['sex']='男'
>>> info
year weight name sex
one 2001 54 张三 男
two 2002 50 张三 男
three 2003 60 张三 男
four 2001 61 李四 男
five 2002 63 李四 男
five 2003 65 李四 男 #列赋值-列值局部赋值
>>> val=Series(['man','woman','man'],index=['two','four','five'])
>>> info['sex']=val
>>> info
year weight name sex
one 2001 54 张三 NaN
two 2002 50 张三 man
three 2003 60 张三 NaN
four 2001 61 李四 woman
five 2002 63 李四 man
five 2003 65 李四 man #为不存在的列创建并赋值
>>> info['sexflag']=info.sex=='man'
>>> info
year weight name sex sexflag
one 2001 54 张三 NaN False
two 2002 50 张三 man True
three 2003 60 张三 NaN False
four 2001 61 李四 woman False
five 2002 63 李四 man True
five 2003 65 李四 man True #删除某一个列
>>> del info['sex']
>>> info
year weight name sexflag
one 2001 54 张三 False
two 2002 50 张三 True
three 2003 60 张三 False
four 2001 61 李四 False
five 2002 63 李四 True
five 2003 65 李四 True #嵌套字典-----convert--->DataFrame
#外层的key是列;内层的key是行
>>> studata={'张三':{'语文':91,'数学':99,'物理':90},'李四':{'语文':31,'数学':65,'物理':45}}
>>> info2=DataFrame(studata)
>>> info2
张三 李四
数学 99 65
物理 90 45
语文 91 31
>>> info2.T
数学 物理 语文
张三 99 90 91
李四 65 45 31 #index.name columns.name 属性
>>> info
year weight name sexflag
one 2001 54 张三 False
two 2002 50 张三 True
three 2003 60 张三 False
four 2001 61 李四 False
five 2002 63 李四 True
five 2003 65 李四 True
>>> info.index.name='个人信息'
>>> info.columns.name='索引'
>>> info
索引 year weight name sexflag
个人信息
one 2001 54 张三 False
two 2002 50 张三 True
three 2003 60 张三 False
four 2001 61 李四 False
five 2002 63 李四 True
five 2003 65 李四 True >>> info.index
Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'个人信息')
#集合去重复
>>> info.index.unique
<bound method Index.unique of Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'个人信息')>
>>> info.index.unique()
array(['one', 'two', 'three', 'four', 'five'], dtype=object)
#是否唯一
>>> info.index.is_unique
False
#当各元素均大于等于前一个元素时候,返回True
>>> DataFrame(range(1,4),index=range(1,4)).index.is_monotonic
True
>>> info.index.is_monotonic
False
#删除传入的值并得到新的index
>>> DataFrame(range(1,4),index=range(1,4)).index.drop(1)
Int64Index([2, 3], dtype='int64') >>> obj=Series([33,23],index=['a','b'])
>>> obj
a 33
b 23
dtype: int64
>>> obj2=obj.reindex(['b','a','c'])
>>> obj2
b 23.0
a 33.0
c NaN
dtype: float64
>>> obj2=obj.reindex(['b','a','c'],fill_value=0)
>>> obj2
b 23
a 33
c 0
dtype: int64 >>> obj3=Series(['blue','purple','yellow'],index=[0,2,4])
>>> obj3
0 blue
2 purple
4 yellow
dtype: object
#ffill前向值填充
>>> obj3.reindex(range(6),method='ffill')
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
#bfill后向填充
>>> obj3.reindex(range(6),method='bfill')
0 blue
1 purple
2 purple
3 yellow
4 yellow
5 NaN
dtype: object >>> frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','d'],columns=['Ohio','Texas','california'])
>>> frame
Ohio Texas california
a 0 1 2
b 3 4 5
d 6 7 8
#重新索引行
>>> frame2=frame.reindex(['a','b','c','d'])
>>> frame2
Ohio Texas california
a 0.0 1.0 2.0
b 3.0 4.0 5.0
c NaN NaN NaN
d 6.0 7.0 8.0
#重新索引列
>>> cols=['Texas','Ohio','uknown']
>>> frame.reindex(columns=cols)
Texas Ohio uknown
a 1 0 NaN
b 4 3 NaN
d 7 6 NaN >>> frame.reindex(index=['a','b','c','d'],method='ffill',columns=cols)
Texas Ohio uknown
a 1 0 NaN
b 4 3 NaN
c 4 3 NaN
d 7 6 NaN
>>> frame.ix[['a','b','c','d'],cols]
Texas Ohio uknown
a 1.0 0.0 NaN
b 4.0 3.0 NaN
c NaN NaN NaN
d 7.0 6.0 NaN >>> data
Texas Ohio uknown
a 1.0 0.0 NaN
b 4.0 3.0 NaN
c NaN NaN NaN
d 7.0 6.0 NaN
#删除行
>>> data.drop(['c','b'])
Texas Ohio uknown
a 1.0 0.0 NaN
d 7.0 6.0 NaN
>>> data.drop('uknown',axis=1)
Texas Ohio
a 1.0 0.0
b 4.0 3.0
c NaN NaN
d 7.0 6.0 #列的条件查询
>>> info[info['weight']>60]
索引 year weight name sexflag
个人信息
four 2001 61 李四 False
five 2002 63 李四 True
five 2003 65 李四 True #
>>> info.ix['one',['name','year']]
索引
name 张三
year 2001
Name: one, dtype: object >>> data=DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','NewYork'],columns=['one','two','three','four'])
>>> data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
NewYork 12 13 14 15
>>> data['two']
Ohio 1
Colorado 5
Utah 9
NewYork 13
Name: two, dtype: int64
>>> data[['three','one']]
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
NewYork 14 12
>>>
>>> data[:2]
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
>>> data[data['three']>5]
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
NewYork 12 13 14 15 >>> data<5
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
NewYork False False False False
>>> data[data<5]=0
>>> data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
NewYork 12 13 14 15 #行列组合查询
>>> data.ix['Colorado',['two','three']]
two 5
three 6
Name: Colorado, dtype: int64
>>> data.ix[['Colorado','Utah'],[3,0,1]]
four one two
Colorado 7 0 5
Utah 11 8 9 >>> data.ix[:'Utah','two']
Ohio 0
Colorado 5
Utah 9
Name: two, dtype: int64
>>> >>> data.ix[data.three>5,:3]
one two three
Colorado 0 5 6
Utah 8 9 10
NewYork 12 13 14 #obj[val] 选取DataFrame的单个列或一组列。在一些特殊情况下会比较便利
#obj.ix[val] 选取DataFrame的单个行或一组行
#obj.ix[:,val] 选取单个列或列子集
#obj.ix[val1,val2] 同时选取行和列
#reindex 新索引 #DataFrame的数据对齐
>>> df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['good','bad','normal'])
>>> df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','normal','bad','supper'])
>>> df1
b c d
good 0.0 1.0 2.0
bad 3.0 4.0 5.0
normal 6.0 7.0 8.0
>>> df2
b d e
good 0.0 1.0 2.0
normal 3.0 4.0 5.0
bad 6.0 7.0 8.0
supper 9.0 10.0 11.0 >>> df1+df2
b c d e
bad 9.0 NaN 12.0 NaN
good 0.0 NaN 3.0 NaN
normal 9.0 NaN 12.0 NaN
supper NaN NaN NaN NaN #没有的值使用0填充
>>> df1.add(df2,fill_value=0)
b c d e
bad 9.0 4.0 12.0 8.0
good 0.0 1.0 3.0 2.0
normal 9.0 7.0 12.0 5.0
supper 9.0 NaN 10.0 11.0
#索引reindex 的填充
>>> df1.reindex(columns=df2.columns,fill_value=0)
b d e
good 0.0 2.0 0
bad 3.0 5.0 0
normal 6.0 8.0 0 #其他的算术方法:
add +
sub -
div /
mul * DataFrame和Series的运算
>>> frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','bad','supper','uknown'])
>>> frame
b d e
good 0.0 1.0 2.0
bad 3.0 4.0 5.0
supper 6.0 7.0 8.0
uknown 9.0 10.0 11.0
>>> series=frame.ix[0]
>>> series
b 0.0
d 1.0
e 2.0
Name: good, dtype: float64
>>>
>>> frame-series
b d e
good 0.0 0.0 0.0
bad 3.0 3.0 3.0
supper 6.0 6.0 6.0
uknown 9.0 9.0 9.0 #frame 和 Series 运算时出现广播现象
>>> series2=Series(range(3),index=[list('bef')])
>>> series2
b 0
e 1
f 2
dtype: int64
>>> frame+series2
b d e f
good 0.0 NaN 3.0 NaN
bad 3.0 NaN 6.0 NaN
supper 6.0 NaN 9.0 NaN
uknown 9.0 NaN 12.0 NaN #在列上广播
>>> series3=frame['d']
>>> frame.sub(series3,axis=0)
b d e
good -1.0 0.0 1.0
bad -1.0 0.0 1.0
supper -1.0 0.0 1.0
uknown -1.0 0.0 1.0 >>> frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['good','bad','nice','supper'])
>>> frame
b d e
good 0.428420 -0.951975 0.862226
bad -0.666254 -0.988423 2.442255
nice 1.617591 0.377867 -1.069077
supper -1.417150 0.449853 0.685007
#全部转换成正数
>>> np.abs(frame)
b d e
good 0.428420 0.951975 0.862226
bad 0.666254 0.988423 2.442255
nice 1.617591 0.377867 1.069077
supper 1.417150 0.449853 0.685007 >>> f=lambda x: x.max()-x.min()
>>> frame.apply(f,axis=0)
b 3.034740
d 1.438276
e 3.511332
dtype: float64
>>> frame.apply(f,axis=1)
good 1.814201
bad 3.430677
nice 2.686668
supper 2.102157
dtype: float64 >>> def f(x):return Series([x.min(),x.max()],index=['min','max'])
...
>>> frame.apply(f)
b d e
min -1.417150 -0.988423 -1.069077
max 1.617591 0.449853 2.442255 #格式化内容
>>> format=lambda x:'%.2f' % x
>>> frame.applymap(format)
b d e
good 0.43 -0.95 0.86
bad -0.67 -0.99 2.44
nice 1.62 0.38 -1.07
supper -1.42 0.45 0.69 #############排序和排名#############
#ascending 升序还是降序
>>> frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=[list('nalv')])
>>> frame
n a l v
three 0 1 2 3
one 4 5 6 7
>>> frame.sort_index()
n a l v
one 4 5 6 7
three 0 1 2 3
>>> frame.sort_index(axis=1)
a l n v
three 1 2 0 3
one 5 6 4 7
>>> frame.sort_index(axis=1,ascending=False)
v n l a
three 3 0 2 1
one 7 4 6 5 >>> obj=Series([4,5,-3,2])
>>> obj.order()
2 -3
3 2
0 4
1 5
dtype: int64 #指定列v倒序排
>>> frame.sort_index(axis=0,ascending=False,by='v')
n a l v
one 4 5 6 7
three 0 1 2 3 >>> frame.sort_index(axis=0,ascending=False,by=['v','l'])
n a l v
one 4 5 6 7
three 0 1 2 3 >>> obj=Series([7,-5,7,4,2,0,4])
>>> obj.rank(method='first')
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
>>> obj.rank(ascending=False,method='max')
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64 >>> DataFrame(studata).T
数学 物理 语文
张三 99 90 91
李四 65 45 31
>>> DataFrame(studata).T.rank(axis=1,ascending=False)
数学 物理 语文
张三 1.0 3.0 2.0
李四 1.0 2.0 3.0
>>> DataFrame(studata).T.rank(axis=0,ascending=False)
数学 物理 语文
张三 1.0 1.0 1.0
李四 2.0 2.0 2.0 >>> datastu=pd.read_csv('/Users/similarface/Downloads/jnn.csv')
>>> datastu
准考证号 姓名 班级 语文 数学 英语 化学 物理
0 304040250124 罗茜 1 101.0 94 102.5 79 74
1 304040250128 沈怡君 1 91.5 96 69.0 82 69
2 304040250321 魏华 2 74.0 28 42.0 56 56
3 304040250233 何仕林 2 60.5 42 34.5 49 46
4 304040250725 屈妮 5 93.5 63 77.5 55 66
5 304040250709 邓培蓓 5 102.5 81 47.0 65 58
6 304040250805 郑清霞 5 89.0 80 63.5 63 65
7 304040250827 明杨 6 108.5 92 79.0 89 83
8 304040250819 李倩 6 93.5 61 44.0 45 32
9 304040250912 江明悦 6 0.0 0 0.0 0 0 >>> datastu.rank(axis=1,ascending=False,method='min')
准考证号 姓名 班级 语文 数学 英语 化学 物理
0 2.0 1.0 8.0 4.0 5.0 3.0 6.0 7.0
1 2.0 1.0 8.0 4.0 3.0 6.0 5.0 6.0
2 2.0 1.0 8.0 3.0 7.0 6.0 4.0 4.0
3 2.0 1.0 8.0 3.0 6.0 7.0 4.0 5.0
4 2.0 1.0 8.0 3.0 6.0 4.0 7.0 5.0
5 2.0 1.0 8.0 3.0 4.0 7.0 5.0 6.0
6 2.0 1.0 8.0 3.0 4.0 6.0 7.0 5.0
7 2.0 1.0 8.0 3.0 4.0 7.0 5.0 6.0
8 2.0 1.0 8.0 3.0 4.0 6.0 5.0 7.0
9 2.0 1.0 3.0 4.0 4.0 4.0 4.0 4.0
>>> datastu.rank(axis=0,ascending=False,method='min')
准考证号 姓名 班级 语文 数学 英语 化学 物理
0 10.0 4.0 9.0 3.0 2.0 1.0 3.0 2.0
1 9.0 5.0 9.0 6.0 1.0 4.0 2.0 3.0
2 7.0 1.0 7.0 8.0 9.0 8.0 6.0 7.0
3 8.0 10.0 7.0 9.0 8.0 9.0 8.0 8.0
4 5.0 9.0 4.0 4.0 6.0 3.0 7.0 4.0
5 6.0 3.0 4.0 2.0 4.0 6.0 4.0 6.0
6 4.0 2.0 4.0 7.0 5.0 5.0 5.0 5.0
7 2.0 8.0 1.0 1.0 3.0 2.0 1.0 1.0
8 3.0 7.0 1.0 4.0 7.0 7.0 9.0 9.0
9 1.0 6.0 1.0 10.0 10.0 10.0 10.0 10.0 >>> data=datastu[['语文','数学','物理','英语','化学']]
>>> data
语文 数学 物理 英语 化学
0 101.0 94 74 102.5 79
1 91.5 96 69 69.0 82
2 74.0 28 56 42.0 56
3 60.5 42 46 34.5 49
4 93.5 63 66 77.5 55
5 102.5 81 58 47.0 65
6 89.0 80 65 63.5 63
7 108.5 92 83 79.0 89
8 93.5 61 32 44.0 45
9 0.0 0 0 0.0 0 >>> data.sum()
语文 814.0
数学 637.0
物理 549.0
英语 559.0
化学 583.0
dtype: float64 >>> data.sum(axis=1)
0 450.5
1 407.5
2 256.0
3 232.0
4 355.0
5 353.5
6 360.5
7 451.5
8 275.5
9 0.0
dtype: float64 #axis
#skipna 排除缺失值NAN
#level >>> data
语文 数学 物理 英语 化学
0 101.0 94 74 102.5 79
1 91.5 96 69 69.0 82
2 74.0 28 56 42.0 56
3 60.5 42 46 34.5 49
4 93.5 63 66 77.5 55
5 102.5 81 58 47.0 65
6 89.0 80 65 63.5 63
7 108.5 92 83 79.0 89
8 93.5 61 32 44.0 45
9 0.0 0 0 0.0 0
#返回间接统计
>>> data.idxmax()
语文 7 最高分数的索引在7
数学 1 最高分数的索引在1
物理 7 最高分数的索引在7
英语 0 最高分数的索引在0
化学 7 最高分数的索引在7
dtype: int64
#累和
>>> data.cumsum()
语文 数学 物理 英语 化学
0 101.0 94.0 74.0 102.5 79.0
1 192.5 190.0 143.0 171.5 161.0
2 266.5 218.0 199.0 213.5 217.0
3 327.0 260.0 245.0 248.0 266.0
4 420.5 323.0 311.0 325.5 321.0
5 523.0 404.0 369.0 372.5 386.0
6 612.0 484.0 434.0 436.0 449.0
7 720.5 576.0 517.0 515.0 538.0
8 814.0 637.0 549.0 559.0 583.0
9 814.0 637.0 549.0 559.0 583.0 >>> data.describe()
语文 数学 物理 英语 化学
count 10.000000 10.00000 10.000000 10.000000 10.000000
mean 81.400000 63.70000 54.900000 55.900000 58.300000
std 31.857146 31.86447 24.052951 28.670349 25.117723
min 0.000000 0.00000 0.000000 0.000000 0.000000
25% 77.750000 46.75000 48.500000 42.500000 50.500000
50% 92.500000 71.50000 61.500000 55.250000 59.500000
75% 99.125000 89.25000 68.250000 75.375000 75.500000
max 108.500000 96.00000 83.000000 102.500000 89.000000 ''' DataFrame.abs() Return an object with absolute value taken–only applicable to objects that are all numeric.
DataFrame.all([axis, bool_only, skipna, level]) Return whether all elements are True over requested axis
DataFrame.any([axis, bool_only, skipna, level]) Return whether any element is True over requested axis
DataFrame.clip([lower, upper, out, axis]) Trim values at input threshold(s).
DataFrame.clip_lower(threshold[, axis]) Return copy of the input with values below given value(s) truncated.
DataFrame.clip_upper(threshold[, axis]) Return copy of input with values above given value(s) truncated.
DataFrame.corr([method, min_periods]) Compute pairwise correlation of columns, excluding NA/null values
DataFrame.corrwith(other[, axis, drop]) Compute pairwise correlation between rows or columns of two DataFrame objects.
DataFrame.count([axis, level, numeric_only]) Return Series with number of non-NA/null observations over requested axis.
DataFrame.cov([min_periods]) Compute pairwise covariance of columns, excluding NA/null values
DataFrame.cummax([axis, dtype, out, skipna]) Return cumulative max over requested axis.
DataFrame.cummin([axis, dtype, out, skipna]) Return cumulative min over requested axis.
DataFrame.cumprod([axis, dtype, out, skipna]) Return cumulative prod over requested axis.
DataFrame.cumsum([axis, dtype, out, skipna]) Return cumulative sum over requested axis.
DataFrame.describe([percentiles, include, ...]) Generate various summary statistics, excluding NaN values.
一阶差分(时间序列很有用)DataFrame.diff([periods, axis]) 1st discrete difference of object
DataFrame.eval(expr[, inplace]) Evaluate an expression in the context of the calling DataFrame instance.
样本的峰度(四阶矩)DataFrame.kurt([axis, skipna, level, ...]) Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
平均绝对离差DataFrame.mad([axis, skipna, level]) Return the mean absolute deviation of the values for the requested axis
DataFrame.max([axis, skipna, level, ...]) This method returns the maximum of the values in the object.
DataFrame.mean([axis, skipna, level, ...]) Return the mean of the values for the requested axis
DataFrame.median([axis, skipna, level, ...]) Return the median of the values for the requested axis
DataFrame.min([axis, skipna, level, ...]) This method returns the minimum of the values in the object.
DataFrame.mode([axis, numeric_only]) Gets the mode(s) of each element along the axis selected.
百分数变化DataFrame.pct_change([periods, fill_method, ...]) Percent change over given number of periods.
DataFrame.prod([axis, skipna, level, ...]) Return the product of the values for the requested axis
DataFrame.quantile([q, axis, numeric_only, ...]) Return values at the given quantile over requested axis, a la numpy.percentile.
DataFrame.rank([axis, method, numeric_only, ...]) Compute numerical data ranks (1 through n) along axis.
DataFrame.round([decimals, out]) Round a DataFrame to a variable number of decimal places.
DataFrame.sem([axis, skipna, level, ddof, ...]) Return unbiased standard error of the mean over requested axis.
样本值的偏度(三阶矩)DataFrame.skew([axis, skipna, level, ...]) Return unbiased skew over requested axis
DataFrame.sum([axis, skipna, level, ...]) Return the sum of the values for the requested axis
标准差DataFrame.std([axis, skipna, level, ddof, ...]) Return sample standard deviation over requested axis.
方差DataFrame.var([axis, skipna, level, ddof, ...]) Return unbiased variance over requested axis.
''' >>> import pandas.io.data as web
>>> all_data={}
>>> for ticker in ['AAPL','IBM','MSFT','GOOG']: all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
>>> price=DataFrame({tic:data['Adj Close'] for tic ,data in all_data.iteritems()})
>>> volume=DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
>>> returns=price.pct_change()
>>> returns.tail()
AAPL GOOG IBM MSFT
Date
2009-12-24 0.034339 0.011117 0.004385 0.002587
2009-12-28 0.012294 0.007098 0.013326 0.005484
2009-12-29 -0.011861 -0.005571 -0.003477 0.007058
2009-12-30 0.012147 0.005376 0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
#计算相关系数
>>> returns.IBM.corr(returns.GOOG)
0.39068882087254675
>>> returns.corrwith(returns.IBM)
AAPL 0.410011
GOOG 0.390689
IBM 1.000000
MSFT 0.495980
dtype: float64 >>> returns.corrwith(volume)
AAPL -0.057549
GOOG 0.062647
IBM -0.007892
MSFT -0.014245
dtype: float64 >>> obj=Series(['c','b','c','c','d','a','g','b'])
>>> obj.value_counts()
c 3
b 2
g 1
d 1
a 1
dtype: int64
>>> pd.value_counts(obj.values,sort=False)
a 1
c 3
b 2
d 1
g 1
dtype: int64 #是否存在
>>> mask=obj.isin(['b','c'])
>>> mask
0 True
1 True
2 True
3 True
4 False
5 False
6 False
7 True
dtype: bool >>> obj[mask]
0 c
1 b
2 c
3 c
7 b
dtype: object #频度柱状图
>>> data=DataFrame({'Qu1':[1,3,4,5,3],'Qu2':[2,4,1,2,4],'Qu3':[3,4,2,1,1]})
>>> data
Qu1 Qu2 Qu3
0 1 2 3
1 3 4 4
2 4 1 2
3 5 2 1
4 3 4 1
>>> data.apply(pd.value_counts).fillna(0)
Qu1 Qu2 Qu3
1 1.0 1.0 2.0
2 0.0 2.0 1.0
3 2.0 0.0 1.0
4 1.0 2.0 1.0
5 1.0 0.0 0.0 #缺失数据处理
>>> string_data=Series(['张三','李四',np.nan,'赵六'])
>>> string_data
0 张三
1 李四
2 NaN
3 赵六
dtype: object
>>> string_data.isnull()
0 False
1 False
2 True
3 False
dtype: bool ######过滤数据过滤缺失数据
>>> from numpy import nan as NA
>>> data=Series([1,NA,3.5,NA,7])
>>> data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
>>> data
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
dtype: float64
>>> data[data.notnull()]
0 1.0
2 3.5
4 7.0
dtype: float64 #DataFrame默认删除只要包含NA的行
>>> data=DataFrame([[1.,6.5,3.],[1,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
>>> data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
>>> data.dropna()
0 1 2
0 1.0 6.5 3.0
#how='all'
>>> data.dropna(how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0 #删除列全是null的
>>> data
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN
>>> data.dropna(axis=1,how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0 #thresh 表示保留该行所需的非空值的最少个数(非空值少于thresh的行会被删除)
>>> df.dropna(thresh=3)
0 1 2
5 0.519277 1.182077 -0.500918
6 -0.050867 -0.051302 1.368309 #填充缺失数据
>>> df.fillna(-1)
0 1 2
0 0.581403 -1.000000 -1.000000
1 -1.709160 -1.000000 -1.000000
2 2.496074 -1.000000 -1.000000
3 0.329339 -1.000000 0.736299
4 -0.638106 -1.000000 0.756044
5 0.519277 1.182077 -0.500918
6 -0.050867 -0.051302 1.368309
#指定列的填充
>>> df.fillna({1:0.5,3:-1})
0 1 2
0 0.581403 0.500000 NaN
1 -1.709160 0.500000 NaN
2 2.496074 0.500000 NaN
3 0.329339 0.500000 0.736299
4 -0.638106 0.500000 0.756044
5 0.519277 1.182077 -0.500918
6 -0.050867 -0.051302 1.368309 #修改原始对象 默认返回新对象
>>> df.fillna({1:0.5,3:-1},inplace=True)
0 1 2
0 0.581403 0.500000 NaN
1 -1.709160 0.500000 NaN
2 2.496074 0.500000 NaN
3 0.329339 0.500000 0.736299
4 -0.638106 0.500000 0.756044
5 0.519277 1.182077 -0.500918
6 -0.050867 -0.051302 1.368309
>>> df
0 1 2
0 0.581403 0.500000 NaN
1 -1.709160 0.500000 NaN
2 2.496074 0.500000 NaN
3 0.329339 0.500000 0.736299
4 -0.638106 0.500000 0.756044
5 0.519277 1.182077 -0.500918
6 -0.050867 -0.051302 1.368309 >>> info=DataFrame(np.random.randn(6,3))
>>> info.ix[:2,1]=NA;info.ix[4:,2]=NA
>>> info
0 1 2
0 1.217480 NaN 0.479981
1 -2.104463 NaN -2.917539
2 -2.141440 NaN -1.371574
3 0.925971 1.697813 0.814347
4 -1.463290 -0.526497 NaN
5 -0.300475 0.839098 NaN
#可以限制行数
>>> info.fillna(method='bfill',limit=1)
0 1 2
0 1.217480 NaN 0.479981
1 -2.104463 NaN -2.917539
2 -2.141440 1.697813 -1.371574
3 0.925971 1.697813 0.814347
4 -1.463290 -0.526497 NaN
5 -0.300475 0.839098 NaN #层次索引
>>> data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
>>> data
a 1 1.148945
2 -0.489120
3 1.151546
b 1 0.840938
2 -1.992375
3 0.039002
c 1 2.157531
2 0.963063
d 2 0.130796
3 0.012320
dtype: float64
>>> data.index
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
>>> data['b']
1 0.840938
2 -1.992375
3 0.039002
dtype: float64
>>> data['b':'c']
b 1 0.840938
2 -1.992375
3 0.039002
c 1 2.157531
2 0.963063
dtype: float64
>>> data.ix[['b','d']]
b 1 0.840938
2 -1.992375
3 0.039002
d 2 0.130796
3 0.012320
dtype: float64 >>> data[:,2]
a -0.489120
b -1.992375
c 0.963063
d 0.130796
dtype: float64 #转换成dataframe
>>> data.unstack()
1 2 3
a 1.148945 -0.489120 1.151546
b 0.840938 -1.992375 0.039002
c 2.157531 0.963063 NaN
d NaN 0.130796 0.012320 >>> data.unstack().stack()
a 1 1.148945
2 -0.489120
3 1.151546
b 1 0.840938
2 -1.992375
3 0.039002
c 1 2.157531
2 0.963063
d 2 0.130796
3 0.012320
dtype: float64 >>> frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['good','good','bad'],['G','R','G']])
>>> frame
good bad
G R G
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11 >>> frame.index.names=['key1','key2']
>>> frame.columns.names=['s','c']
>>> frame
s good bad
c G R G
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11 >>> frame['good']
c G R
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10 #重排分级顺序
>>> frame.swaplevel('key1','key2')
good bad
G R G
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
>>> frame.sortlevel(1)
state good bad
color G R G
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
>>> frame.swaplevel(0,1).sortlevel(0)
state good bad
color G R G
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
#根据层次汇总
>>> frame.sum(level='key2')
state good bad
color G R G
key2
1 6 8 10
2 12 14 16
>>> frame.sum(level='color',axis=1)
color G R
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10 #使用DataFrame的列
>>> frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
>>> frame
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
>>> frame2=frame.set_index(['c','d'])
>>> frame2
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
>>> frame2=frame.set_index(['c','d'],drop=False)
>>> frame2
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3 ##############读取文件################
>>> os.system('cat /Users/similarface/Downloads/jnn.csv')
准考证号,姓名,班级,语文,数学,英语,化学,物理
304040250124,罗茜,1,101,94,102.5,79,74
304040250128,沈怡君,1,91.5,96,69,82,69
304040250321,魏华,2,74,28,42,56,56
304040250233,何仕林,2,60.5,42,34.5,49,46
304040250725,屈妮,5,93.5,63,77.5,55,66
304040250709,邓培蓓,5,102.5,81,47,65,58
304040250805,郑清霞,5,89,80,63.5,63,65
304040250827,明杨,6,108.5,92,79,89,83
304040250819,李倩,6,93.5,61,44,45,32
304040250912,江明悦,6,0,0,0,0,00
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv')
准考证号 姓名 班级 语文 数学 英语 化学 物理
0 304040250124 罗茜 1 101.0 94 102.5 79 74
1 304040250128 沈怡君 1 91.5 96 69.0 82 69
2 304040250321 魏华 2 74.0 28 42.0 56 56
3 304040250233 何仕林 2 60.5 42 34.5 49 46
4 304040250725 屈妮 5 93.5 63 77.5 55 66
5 304040250709 邓培蓓 5 102.5 81 47.0 65 58
6 304040250805 郑清霞 5 89.0 80 63.5 63 65
7 304040250827 明杨 6 108.5 92 79.0 89 83
8 304040250819 李倩 6 93.5 61 44.0 45 32
9 304040250912 江明悦 6 0.0 0 0.0 0 0
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考证号')
姓名 班级 语文 数学 英语 化学 物理
准考证号
304040250124 罗茜 1 101.0 94 102.5 79 74
304040250128 沈怡君 1 91.5 96 69.0 82 69
304040250321 魏华 2 74.0 28 42.0 56 56
304040250233 何仕林 2 60.5 42 34.5 49 46
304040250725 屈妮 5 93.5 63 77.5 55 66
304040250709 邓培蓓 5 102.5 81 47.0 65 58
304040250805 郑清霞 5 89.0 80 63.5 63 65
304040250827 明杨 6 108.5 92 79.0 89 83
304040250819 李倩 6 93.5 61 44.0 45 32
304040250912 江明悦 6 0.0 0 0.0 0 0 #数量不定的空白符分割
>>> result=pd.read_table('ext3.txt',sep='\s+') #忽略的行数
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考证号',skiprows=[5,9])
姓名 班级 语文 数学 英语 化学 物理
准考证号
304040250124 罗茜 1 101.0 94 102.5 79 74
304040250128 沈怡君 1 91.5 96 69.0 82 69
304040250321 魏华 2 74.0 28 42.0 56 56
304040250233 何仕林 2 60.5 42 34.5 49 46
304040250709 邓培蓓 5 102.5 81 47.0 65 58
304040250805 郑清霞 5 89.0 80 63.5 63 65
304040250827 明杨 6 108.5 92 79.0 89 83
304040250912 江明悦 6 0.0 0 0.0 0 0 #缺失值的填充
NA -1.#IND NULL
>>> os.system('cat /Users/similarface/Downloads/ex5.csv')
something,a,b,c,d,message
one,1,2,IND,4,NA
tow,-1,-1,,8,world
three,.,10,11,NULL,foo
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['NULL'])
something a b c d message
0 one 1 2 IND 4.0 NaN
1 tow -1 -1 NaN 8.0 world
2 three . 10 11 NaN foo
#指定空值
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['-1'])
something a b c d message
0 one 1 2.0 IND 4.0 NaN
1 tow NaN NaN NaN 8.0 world
2 three . 10.0 11 NaN foo
>>> sentinels={'message':['foo','NA'],'something':['tow']}
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=sentinels)
something a b c d message
0 one 1 2 IND 4.0 NaN
1 NaN -1 -1 NaN 8.0 world
2 three . 10 11 NaN NaN '''
filepath_or_buffer : str, pathlib.Path, py._path.local.LocalPath or any object with a read() method (such as a file handle or StringIO)
The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file://localhost/path/to/table.csv
sep : str, default ‘,’
Delimiter to use. If sep is None, will try to automatically determine this. Regular expressions are accepted and will force use of the python parsing engine and will ignore quotes in the data.
delimiter : str, default None
Alternative argument name for sep.
header : int or list of ints, default ‘infer’
Row number(s) to use as the column names, and the start of the data. Default behavior is as if set to 0 if no names passed, otherwise None. Explicitly pass header=0 to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example is skipped). Note that this parameter ignores commented lines and empty lines if skip_blank_lines=True, so header=0 denotes the first line of data rather than the first line of the file.
names : array-like, default None
List of column names to use. If file contains no header row, then you should explicitly pass header=None
index_col : int or sequence or False, default None
Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names)
usecols : array-like, default None
Return a subset of the columns. Results in much faster parsing time and lower memory usage.
squeeze : boolean, default False
If the parsed data only contains one column then return a Series
prefix : str, default None
Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1, ...
mangle_dupe_cols : boolean, default True
Duplicate columns will be specified as ‘X.0’...’X.N’, rather than ‘X’...’X’
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} (Unsupported with engine=’python’). Use str or object to preserve and not interpret dtype.
engine : {‘c’, ‘python’}, optional
Parser engine to use. The C engine is faster while the python engine is currently more feature-complete.
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can either be integers or column labels
true_values : list, default None
Values to consider as True
false_values : list, default None
Values to consider as False
skipinitialspace : boolean, default False
Skip spaces after delimiter.
skiprows : list-like or integer, default None
Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file
skipfooter : int, default 0
Number of lines at bottom of file to skip (Unsupported with engine=’c’)
nrows : int, default None
Number of rows of file to read. Useful for reading pieces of large files
na_values : str or list-like or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘nan’.
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they’re appended to.
na_filter : boolean, default True
Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing na_filter=False can improve the performance of reading a large file
verbose : boolean, default False
Indicate number of NA values placed in non-numeric columns
skip_blank_lines : boolean, default True
If True, skip over blank lines rather than interpreting as NaN values
parse_dates : boolean or list of ints or names or list of lists or dict, default False
boolean. If True -> try parsing the index.
list of ints or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.
list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
dict, e.g. {‘foo’ : [1, 3]} -> parse columns 1, 3 as date and call result ‘foo’
Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : boolean, default False
If True and parse_dates is enabled for a column, attempt to infer the datetime format to speed up the processing
keep_date_col : boolean, default False
If True and parse_dates specifies combining multiple columns then keep the original columns.
date_parser : function, default None
Function to use for converting a sequence of string columns to an array of datetime instances. The default uses dateutil.parser.parser to do the conversion. Pandas will try to call date_parser in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string values from the columns defined by parse_dates into a single array and pass that; and 3) call date_parser once for each row using one or more strings (corresponding to the columns defined by parse_dates) as arguments.
dayfirst : boolean, default False
DD/MM format dates, international and European format
iterator : boolean, default False
Return TextFileReader object for iteration or getting chunks with get_chunk().
chunksize : int, default None
Return TextFileReader object for iteration. See IO Tools docs for more information on iterator and chunksize.
compression : {‘infer’, ‘gzip’, ‘bz2’, None}, default ‘infer’
For on-the-fly decompression of on-disk data. If ‘infer’, then use gzip or bz2 if filepath_or_buffer is a string ending in ‘.gz’ or ‘.bz2’, respectively, and no decompression otherwise. Set to None for no decompression.
thousands : str, default None
Thousands separator
decimal : str, default ‘.’
Character to recognize as decimal point (e.g. use ‘,’ for European data).
lineterminator : str (length 1), default None
Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default None
Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior.
escapechar : str (length 1), default None
One-character string used to escape delimiter when quoting is QUOTE_NONE.
comment : str, default None
Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as skip_blank_lines=True), fully commented lines are ignored by the parameter header but not by skiprows. For example, if comment=’#’, parsing ‘#empty\na,b,c\n1,2,3’ with header=0 will result in ‘a,b,c’ being treated as the header.
encoding : str, default None
Encoding to use for UTF when reading/writing (ex. ‘utf-8’). List of Python standard encodings
dialect : str or csv.Dialect instance, default None
If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details
tupleize_cols : boolean, default False
Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns)
error_bad_lines : boolean, default True
Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these “bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
warn_bad_lines : boolean, default True
If error_bad_lines is False, and warn_bad_lines is True, a warning for each “bad line” will be output. (Only valid with C parser).
'''
#数据写入
data.to_csv('文件名/sys.stdout',sep='|',index=True/False,header=True/False,columns=[选取的列])
#数据库操作
import pandas as pd
from pandas import *
import sqlite3
query="""
create table test(
a varchar(20),b VARCHAR(20),c REAL ,d INTEGER
);
"""
con=sqlite3.connect(':memory:')
con.execute(query)
con.commit()
data=[('Atlanta','Georgia',1.25,6),
('Tallahassee','Florida',2.6,3),
('Sacramento','California',1.7,5)
]
stmt="INSERT INTO test VALUES (?,?,?,?)"
con.executemany(stmt,data)
con.commit()
cursor=con.execute('select * from test')
rows=cursor.fetchall()
DataFrame(rows,columns=zip(*cursor.description)[0])
#直接写sql读取dataFrame
import pandas.io.sql as sql
sql.read_sql('select * from test',con) #合并数据集
>>> df1 = DataFrame(
... {'key': ['北京大学', '四川大学', '天津大学', '山东大学', '清华大学'],
... 'major0': ['计算机','生物','化学','物理','医学']
... })
>>> df2 = DataFrame(
... {'key': ['北京大学', '四川大学', '云南大学'],
... 'major1': ['外国语', '口腔', '旅游']
... })
>>> df1
key major0
0 北京大学 计算机
1 四川大学 生物
2 天津大学 化学
3 山东大学 物理
4 清华大学 医学
>>> df2
key major1
0 北京大学 外国语
1 四川大学 口腔
2 云南大学 旅游 >>> pd.merge(df1,df2)
key major0 major1
0 北京大学 计算机 外国语
1 四川大学 生物 口腔 >>> df3 = DataFrame(
... {'lkey': ['北京大学', '四川大学', '天津大学', '山东大学', '清华大学'],
... 'major0': ['计算机','生物','化学','物理','医学']
... })
>>> df4 = DataFrame(
... {'rkey': ['北京大学', '四川大学', '云南大学'],
... 'major1': ['外国语', '口腔', '旅游']
... }) >>> df3
lkey major0
0 北京大学 计算机
1 四川大学 生物
2 天津大学 化学
3 山东大学 物理
4 清华大学 医学
>>> df4
major1 rkey
0 外国语 北京大学
1 口腔 四川大学
2 旅游 云南大学 >>> pd.merge(df3,df4,left_on='lkey',right_on='rkey')
lkey major0 major1 rkey
0 北京大学 计算机 外国语 北京大学
1 四川大学 生物 口腔 四川大学
#外连接
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='outer')
lkey major0 major1 rkey
0 北京大学 计算机 外国语 北京大学
1 四川大学 生物 口腔 四川大学
2 天津大学 化学 NaN NaN
3 山东大学 物理 NaN NaN
4 清华大学 医学 NaN NaN
5 NaN NaN 旅游 云南大学
#左连接
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='left')
lkey major0 major1 rkey
0 北京大学 计算机 外国语 北京大学
1 四川大学 生物 口腔 四川大学
2 天津大学 化学 NaN NaN
3 山东大学 物理 NaN NaN
4 清华大学 医学 NaN NaN
#右连接
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='right')
lkey major0 major1 rkey
0 北京大学 计算机 外国语 北京大学
1 四川大学 生物 口腔 四川大学
2 NaN NaN 旅游 云南大学
#内连接
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='inner')
lkey major0 major1 rkey
0 北京大学 计算机 外国语 北京大学
1 四川大学 生物 口腔 四川大学 #多个键进行合并 left=DataFrame({
'key1':['foo','foo','bar'],
'key2':['one','two','one'],
'lval':[1,2,3]
}) right=DataFrame({
'key1':['foo','foo','bar','bar'],
'key2':['one','one','one','two'],
'lval':[4,5,6,7]
}) >>> pd.merge(left,right,on=['key1','key2'],how='outer')
key1 key2 lval_x lval_y
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0 #重复列名的处理
>>> pd.merge(left,right,on='key1',suffixes=('_lef','_right'))
key1 key2_lef lval_lef key2_right lval_right
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7 #索引上的合并
>>> right1=DataFrame({'group_val':[3.5,7]},index=['a','b'])
>>> left1=DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
#合并根据索引对比
>>> pd.merge(left1,right1,left_on='key',right_index=True)
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0 lefth=DataFrame(
{'key1':['similar','similar','similar','face','face'],
'key2':[2000,2001,2002,2001,2002],
'data':np.arange(5.)
}) righth=DataFrame(np.arange(12).reshape((6,2)),
index=[['face','face','similar','similar','similar','similar'],
[2001,2000,2000,2000,2001,2002]
],
columns=['event1','event2']
)
>>> lefth
data key1 key2
0 0.0 similar 2000
1 1.0 similar 2001
2 2.0 similar 2002
3 3.0 face 2001
4 4.0 face 2002
>>> righth
event1 event2
face 2001 0 1
2000 2 3
similar 2000 4 5
2000 6 7
2001 8 9
2002 10 11 >>> pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
data key1 key2 event1 event2
0 0.0 similar 2000 4 5
0 0.0 similar 2000 6 7
1 1.0 similar 2001 8 9
2 2.0 similar 2002 10 11
3 3.0 face 2001 0 1 >>> left2=DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['similar','face'])
>>> left2
similar face
a 1.0 2.0
c 3.0 4.0
e 5.0 6.0
>>> right2=DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['M','A'])
>>> right2
M A
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0 >>> pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
similar face M A
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
>>> left2.join(right2,how='outer')
similar face M A
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
>>> another=DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['NK','O'])
>>> left2.join([right2,another])
similar face M A NK O
a 1.0 2.0 NaN NaN 7 8
c 3.0 4.0 9.0 10.0 9 10
e 5.0 6.0 13.0 14.0 11 12 #轴向连接
>>> arr=np.arange(12).reshape((3,4))
>>> arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
>>> np.concatenate([arr,arr],axis=1)
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
>>> s1=Series([0,1],index=['a','b'])
>>> s2=Series([2,3,4],index=['c','d','e'])
>>> s3=Series([5,6],index=['f','g'])
>>> s1
a 0
b 1
dtype: int64
>>> s2
c 2
d 3
e 4
dtype: int64
>>> s3
f 5
g 6
dtype: int64
>>> pd.concat([s1,s2,s3])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
>>> pd.concat([s1,s2,s3,s1])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
a 0
b 1
dtype: int64
>>> pd.concat([s1,s2,s3,s1],axis=1)
0 1 2 3
a 0.0 NaN NaN 0.0
b 1.0 NaN NaN 1.0
c NaN 2.0 NaN NaN
d NaN 3.0 NaN NaN
e NaN 4.0 NaN NaN
f NaN NaN 5.0 NaN
g NaN NaN 6.0 NaN df1=DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df2=DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
>>> pd.concat([df1,df2],axis=1,keys=['level1','level2'])
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
>>> pd.concat({'level1':df1,'level2':df2},axis=1)
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
>>> pd.concat([df1,df2],axis=1,keys=['L1','L2'],names=['u','l'])
u L1 L2
l one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
>>> df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
>>> df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
>>> df1
a b c d
0 -1.487358 0.077565 0.209403 -0.712507
1 1.990047 -0.221415 1.381161 -0.876811
2 -0.153150 0.391847 1.180728 -0.972548
>>> df2
b d a
0 -0.200611 0.321759 -0.201620
1 -1.842735 -1.924933 0.281712 >>> pd.concat([df1,df2])
a b c d
0 -1.487358 0.077565 0.209403 -0.712507
1 1.990047 -0.221415 1.381161 -0.876811
2 -0.153150 0.391847 1.180728 -0.972548
0 -0.201620 -0.200611 NaN 0.321759
1 0.281712 -1.842735 NaN -1.924933 >>> pd.concat([df1,df2],ignore_index=True)
a b c d
0 -1.487358 0.077565 0.209403 -0.712507
1 1.990047 -0.221415 1.381161 -0.876811
2 -0.153150 0.391847 1.180728 -0.972548
3 -0.201620 -0.200611 NaN 0.321759
4 0.281712 -1.842735 NaN -1.924933 >>> pd.concat([df1,df2],ignore_index=True,axis=1)
0 1 2 3 4 5 6
0 -1.487358 0.077565 0.209403 -0.712507 -0.200611 0.321759 -0.201620
1 1.990047 -0.221415 1.381161 -0.876811 -1.842735 -1.924933 0.281712
2 -0.153150 0.391847 1.180728 -0.972548 NaN NaN NaN >>> b[:-2]
f 0.0
e 1.0
d 2.0
c 3.0
dtype: float64
>>> a[2:]
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
>>> b[:-2].combine_first(a[2:])
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64 >>> df1=DataFrame({'a':[1,np.nan,5,np.nan],'b':[np.nan,2,np.nan,6],'c':range(2,18,4)})
>>> df2=DataFrame({'a':[5,4,np.nan,3,7],'b':[np.nan,3,4,6,8]})
>>> df2
a b
0 5.0 NaN
1 4.0 3.0
2 NaN 4.0
3 3.0 6.0
4 7.0 8.0
>>> df1
a b c
0 1.0 NaN 2
1 NaN 2.0 6
2 5.0 NaN 10
3 NaN 6.0 14
>>> df1.combine_first(df2)
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN #重塑和轴向旋转
>>> data=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['similar','face'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
>>> data
number one two three
state
similar 0 1 2
face 3 4 5
>>> data.stack()
state number
similar one 0
two 1
three 2
face one 3
two 4
three 5
dtype: int64
>>> data.stack().unstack()
number one two three
state
similar 0 1 2
face 3 4 5 >>> data.stack().unstack(0)
state similar face
number
one 0 3
two 1 4
three 2 5 >>> data.stack().unstack('state')
state similar face
number
one 0 3
two 1 4
three 2 5 >>> s1=Series([0,1,2,3],index=['a','b','c','d'])
>>> s2=Series([4,5,6],index=['c','d','e'])
>>> s1
a 0
b 1
c 2
d 3
dtype: int64
>>> s2
c 4
d 5
e 6
dtype: int64
>>> pd.concat([s1,s2],keys=['one','two'])
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64
>>> pd.concat([s1,s2],keys=['one','two']).unstack()
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0
>>> pd.concat([s1,s2],keys=['one','two']).unstack().stack()
one a 0.0
b 1.0
c 2.0
d 3.0
two c 4.0
d 5.0
e 6.0
dtype: float64
>>> pd.concat([s1,s2],keys=['one','two']).unstack().stack(dropna=False)
one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64 #利用函数进行数据转换 data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
'corned beef', 'Bacon', 'pastrami', 'honey ham',
'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]}) meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
} data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
>>> data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon >>> data['food'].map(lambda x: meat_to_animal[x.lower()])
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object 离散化和面元划分:
#指定组名称
>>> group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
>>> pd.cut(ages,bins,labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
#等长面元 将下面的随机数分4段 precision小数点位数
>>> data=np.random.rand(20)
>>> data
array([ 0.42519089, 0.18981873, 0.29726754, 0.37843724, 0.31072184,
0.20240683, 0.99244468, 0.61880299, 0.9948212 , 0.32893834,
0.87701908, 0.25638677, 0.02344737, 0.15162624, 0.31874342,
0.16534997, 0.43495775, 0.83059911, 0.57975644, 0.53763544])
>>> pd.cut(data,4,precision=2)
[(0.27, 0.51], (0.022, 0.27], (0.27, 0.51], (0.27, 0.51], (0.27, 0.51], ..., (0.022, 0.27], (0.27, 0.51], (0.75, 0.99], (0.51, 0.75], (0.51, 0.75]]
Length: 20
Categories (4, object): [(0.022, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 0.99]]
#分段求值
>>> pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
#左闭右开
pd.cut(ages, [18, 26, 36, 61, 100], right=False) #检查、过滤异常值
>>> np.random.seed(12345)
>>> data=DataFrame(np.random.randn(1000,4))
>>> data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
>>> col=data[3]
>>> col[np.abs(col)>3]
97 3.927528
305 -3.399312
400 -3.745356
Name: 3, dtype: float64 #随机重排序
>>> sampler=np.random.permutation(5)
>>> df.take(sampler)
0 1 2 3
4 16 17 18 19
2 8 9 10 11
1 4 5 6 7
3 12 13 14 15
0 0 1 2 3 >>> df.take(np.random.permutation(len(df))[:3])
0 1 2 3
1 4 5 6 7
2 8 9 10 11
0 0 1 2 3 #给定数组的值生成大集合
>>> bag=np.array([5,7,-1,6,4])
>>> sampler=np.random.randint(0,len(bag),size=10)
>>> sampler
array([1, 0, 4, 1, 2, 1, 4, 4, 3, 4])
>>> draws=bag.take(sampler)
>>> draws
array([ 7, 5, 4, 7, -1, 7, 4, 4, 6, 4]) #哑变量矩阵 和 指标矩阵
@某一列出现与否的矩阵
>>> df=DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
>>> df
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b
>>> pd.get_dummies(df['key'])
a b c
0 0.0 1.0 0.0
1 0.0 1.0 0.0
2 1.0 0.0 0.0
3 0.0 0.0 1.0
4 1.0 0.0 0.0
5 0.0 1.0 0.0 #
>>> dummies=pd.get_dummies(df['key'],prefix='key')
>>> dummies
key_a key_b key_c
0 0.0 1.0 0.0
1 0.0 1.0 0.0
2 1.0 0.0 0.0
3 0.0 0.0 1.0
4 1.0 0.0 0.0
5 0.0 1.0 0.0
>>> df_with_dummy=df[['data1']].join(dummies)
>>> df_with_dummy
data1 key_a key_b key_c
0 0 0.0 1.0 0.0
1 1 0.0 1.0 0.0
2 2 1.0 0.0 0.0
3 3 0.0 0.0 1.0
4 4 1.0 0.0 0.0
5 5 0.0 1.0 0.0 >>> values
array([ 0.86789062, 0.4187927 , 0.48191735, 0.44540277, 0.6855452 ,
0.33193716, 0.20772778, 0.21461227, 0.50985294, 0.95327048])
>>>
>>> bins=[0,0.2,0.4,0.6,0.8,1]
>>> pd.get_dummies(pd.cut(values,bins))
(0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1]
0 0.0 0.0 0.0 0.0 1.0
1 0.0 0.0 1.0 0.0 0.0
2 0.0 0.0 1.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0
4 0.0 0.0 0.0 1.0 0.0
5 0.0 1.0 0.0 0.0 0.0
6 0.0 1.0 0.0 0.0 0.0
7 0.0 1.0 0.0 0.0 0.0
8 0.0 0.0 1.0 0.0 0.0
9 0.0 0.0 0.0 0.0 1.0 #电子邮件正则
>>> pattern=r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
>>> regex=re.compile(pattern,flags=re.IGNORECASE)
>>> regex.match('jaflfbs@sina.com')
<_sre.SRE_Match object at 0x111ceab78>
>>> m=regex.match('jaflfbs@sina.com')
>>> m.groups()
('jaflfbs', 'sina', 'com') #分组 group by groupby
>>> df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','tow','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
>>> df
data1 data2 key1 key2
0 -0.893905 0.311668 a one
1 1.274761 0.885820 a two
2 1.115914 0.887069 b one
3 0.054165 0.267643 b tow
4 -0.819516 0.933495 a one
>>> grouped=df['data1'].groupby(df['key1'])
>>> grouped
<pandas.core.groupby.SeriesGroupBy object at 0x111e11e10>
>>> grouped.mean()
key1
a -0.14622
b 0.58504
Name: data1, dtype: float64
>>> means=df['data1'].groupby([df['key1'],df['key2']]).mean()
>>> means
key1 key2
a one -0.856710
two 1.274761
b one 1.115914
tow 0.054165
Name: data1, dtype: float64 >>> means.unstack()
key2 one tow two
key1
a -0.856710 NaN 1.274761
b 1.115914 0.054165 NaN #可以具体指定 分组的列
>>> states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
>>> years = np.array([2005, 2005, 2006, 2005, 2006])
>>> df['data1'].groupby([states,years]).mean()
#分组的可以是列名 key2没有出现 因为key2不是数值类型的
>>> df.groupby('key1').mean()
data1 data2
key1
a -0.14622 0.710328
b 0.58504 0.577356 >>> df.groupby(['key1','key2']).mean()
data1 data2
key1 key2
a one -0.856710 0.622582
two 1.274761 0.885820
b one 1.115914 0.887069
tow 0.054165 0.267643 #获取分组的大小
>>> df.groupby(['key1','key2']).size()
key1 key2
a one 2
two 1
b one 1
tow 1 #
>>> pieces=dict(list(df.groupby('key1')))
>>> pieces['b']
data1 data2 key1 key2
2 1.115914 0.887069 b one
3 0.054165 0.267643 b tow ############时间操作
>>> from datetime import datetime
>>> now=datetime.now()
>>> now
datetime.datetime(2016, 4, 12, 14, 31, 50, 995484)
>>> now.year,now.month,now.day
(2016, 4, 12)
>>> now.day
12
>>> #datetime以微秒精度存储日期和时间 datetime.timedelta表示两个datetime对象之间的时间差
>>> delta=datetime(2016,5,1)-datetime(2016,5,2)
>>> delta
datetime.timedelta(-1)
>>> delta.days
-1
>>> delta.seconds
0
>>> from datetime import timedelta
>>> start=datetime(2011,1,1)
>>> start+timedelta(12)
datetime.datetime(2011, 1, 13, 0, 0)
>>> start-2*timedelta(12)
datetime.datetime(2010, 12, 8, 0, 0)
>>> stamp=datetime(2011,1,3)
>>> str(stamp)
'2011-01-03 00:00:00'
>>> value='2016-01-01'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2016, 1, 1, 0, 0)
>>> value='2016-01-13'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2016, 1, 13, 0, 0)
>>> value='2016-13-13'
>>> datetime.strptime(value,'%Y-%m-%d')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/_strptime.py", line 325, in _strptime
(data_string, format))
ValueError: time data '2016-13-13' does not match format '%Y-%m-%d'
>>> datestrs=['7/6/2016','1/1/1111']
>>> [datetime.strptime(x,'%m/%d/%Y') for x in datestrs]
[datetime.datetime(2016, 7, 6, 0, 0), datetime.datetime(1111, 1, 1, 0, 0)] >>> from dateutil.parser import parse
>>> parse('2016-01-09')
datetime.datetime(2016, 1, 9, 0, 0)
>>> parse('Jan 31,2015 10:31 PM')
datetime.datetime(2015, 1, 31, 22, 31)
>>> parse('1/3/2018',dayfirst=True)
datetime.datetime(2018, 3, 1, 0, 0)
>>> parse('1/3/2018',dayfirst=False)
datetime.datetime(2018, 1, 3, 0, 0) >>> datestrs=['1/4/2016','4/1/2017']
>>> pd.to_datetime(datestrs)
DatetimeIndex(['2016-01-04', '2017-04-01'], dtype='datetime64[ns]', freq=None)
>>> idx=pd.to_datetime(datestrs+[None])
>>> idx
DatetimeIndex(['2016-01-04', '2017-04-01', 'NaT'], dtype='datetime64[ns]', freq=None) >>> pd.isnull(idx)
array([False, False, True], dtype=bool) >>> dates=[datetime(2011,1,2),datetime(2016,1,1),datetime(2016,1,2),datetime(2016,1,3),datetime(2016,1,4),datetime(2016,1,5)]
>>> dates
[datetime.datetime(2011, 1, 2, 0, 0), datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0), datetime.datetime(2016, 1, 4, 0, 0), datetime.datetime(2016, 1, 5, 0, 0)]
>>> from pandas import *
>>> ts=Series(np.random.randn(6),index=dates)
>>> ts
2011-01-02 0.734018
2016-01-01 1.661590
2016-01-02 0.839504
2016-01-03 -1.295834
2016-01-04 0.190545
2016-01-05 0.267724
dtype: float64 >>> ts+ts[::2]
2011-01-02 1.468037
2016-01-01 NaN
2016-01-02 1.679008
2016-01-03 NaN
2016-01-04 0.381091
2016-01-05 NaN
dtype: float64 >>> ts.index.dtype
dtype('<M8[ns]')
>>> stamp=ts.index[0]
>>> stamp
Timestamp('2011-01-02 00:00:00')
>>> stamp=ts.index[2]
>>> ts[stamp]
0.83950398236998658
>>> ts['1/1/2016']
1.6615901161098698 >>> longer_ts=Series(np.random.randn(1000),index=pd.date_range('1/1/2000',periods=1000))
>>> longer_ts['2002-09-21':'2002-09-23']
2002-09-21 -0.105898
2002-09-22 1.708342
2002-09-23 -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'09/23/2002']
2002-09-21 -0.105898
2002-09-22 1.708342
2002-09-23 -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'23/09/2002']
2002-09-21 -0.105898
2002-09-22 1.708342
2002-09-23 -0.815799
Freq: D, dtype: float64 >>> longer_ts.truncate(before='2002-09-23')
2002-09-23 -0.815799
2002-09-24 -0.140892
2002-09-25 -0.397591
2002-09-26 0.451815
Freq: D, dtype: float64
>>> longer_ts.truncate(after='2002-09-23') #重复时间序列 >>> dates=pd.DatetimeIndex(['1/1/2016','1/2/2016','1/2/2016','1/2/2016','1/3/2016'])
>>> dates
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-02', '2016-01-02',
'2016-01-03'],
dtype='datetime64[ns]', freq=None)
>>> dup_ts=Series(range(5),index=dates)
>>> dup_ts
2016-01-01 0
2016-01-02 1
2016-01-02 2
2016-01-02 3
2016-01-03 4
dtype: int64
>>> dup_ts.index.is_unique
False
>>> dup_ts[]
File "<stdin>", line 1
dup_ts[]
^
SyntaxError: invalid syntax
>>> dup_ts['1/2/2016']
2016-01-02 1
2016-01-02 2
2016-01-02 3
dtype: int64
>>> grouped=dup_ts.groupby(level=0)
>>> grouped.mean()
2016-01-01 0
2016-01-02 2
2016-01-03 4
dtype: int64
>>> grouped.max()
2016-01-01 0
2016-01-02 3
2016-01-03 4
dtype: int64
>>> grouped.count()
2016-01-01 1
2016-01-02 3
2016-01-03 1
dtype: int64 #4-6月的日期
>>> index=pd.date_range('4/1/2016','6/1/2016')
#开始 向后多少天
>>> pd.date_range(start='4/1/2016',periods=20)
DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
'2016-04-05', '2016-04-06', '2016-04-07', '2016-04-08',
'2016-04-09', '2016-04-10', '2016-04-11', '2016-04-12',
'2016-04-13', '2016-04-14', '2016-04-15', '2016-04-16',
'2016-04-17', '2016-04-18', '2016-04-19', '2016-04-20'],
dtype='datetime64[ns]', freq='D') >>> pd.date_range(end='2016-12-12',periods=10)
DatetimeIndex(['2016-12-03', '2016-12-04', '2016-12-05', '2016-12-06',
'2016-12-07', '2016-12-08', '2016-12-09', '2016-12-10',
'2016-12-11', '2016-12-12'],
dtype='datetime64[ns]', freq='D') >>> pd.date_range('1/1/2016','12/2/2016',freq='BM')
DatetimeIndex(['2016-01-29', '2016-02-29', '2016-03-31', '2016-04-29',
'2016-05-31', '2016-06-30', '2016-07-29', '2016-08-31',
'2016-09-30', '2016-10-31', '2016-11-30'],
dtype='datetime64[ns]', freq='BM') >>> pd.date_range('5/2/2012 12:12:12',periods=5)
DatetimeIndex(['2012-05-02 12:12:12', '2012-05-03 12:12:12',
'2012-05-04 12:12:12', '2012-05-05 12:12:12',
'2012-05-06 12:12:12'],
dtype='datetime64[ns]', freq='D')
#normalize 午夜12点
>>> pd.date_range('5/2/2016 12:13:14',periods=5,normalize=True)
DatetimeIndex(['2016-05-02', '2016-05-03', '2016-05-04', '2016-05-05',
'2016-05-06'],
dtype='datetime64[ns]', freq='D') >>> from pandas.tseries.offsets import Hour,Minute
>>> hour=Hour
>>> hour
<class 'pandas.tseries.offsets.Hour'>
>>> four_hours=Hour(4)
>>> four_hours
<4 * Hours>
>>>
>>> pd.date_range('1/1/2016','1/2/2016',freq='4h')
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 04:00:00',
'2016-01-01 08:00:00', '2016-01-01 12:00:00',
'2016-01-01 16:00:00', '2016-01-01 20:00:00',
'2016-01-02 00:00:00'],
dtype='datetime64[ns]', freq='4H') >>> pd.date_range('1/1/2000',periods=2,freq='1h30min')
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00'], dtype='datetime64[ns]', freq='90T') freq
-----------------------------
http://pandas.pydata.org/pandas-docs/version/0.18.0/timeseries.html#dateoffset-objects
-----------------------------
D 每日
B 工作日
H 小时
T 分钟
S 秒
L 毫秒
U 微秒
M 每月最后一天
BM 每月最后一个工作日
MS 每月第一个日历日
BMS 每月第一个工作日
W-MON W-TUE[WED THU FRI SAT SUN]
WOM-1MON WOM-2MON 每月第一个星期一 。。。
Q-JAN 月份 JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
BQ-JAN AS-JAN 每年指定月份的第一个日历日
BAS-JAN BAS-FEB 每年指定月份的第一个工作日 >>> rng=pd.date_range('1/1/2016','9/1/2012',freq='WOM-3FRI')
>>> rng
DatetimeIndex([], dtype='datetime64[ns]', freq='WOM-3FRI')
>>> rng=pd.date_range('1/1/2016','9/1/2016',freq='WOM-3FRI')
>>> rng
DatetimeIndex(['2016-01-15', '2016-02-19', '2016-03-18', '2016-04-15',
'2016-05-20', '2016-06-17', '2016-07-15', '2016-08-19'],
dtype='datetime64[ns]', freq='WOM-3FRI') >>> ts=Series(np.random.randn(4),index=pd.date_range('1/1/2000',periods=4,freq='M'))
>>> ts
2000-01-31 0.246254
2000-02-29 0.426385
2000-03-31 0.832971
2000-04-30 1.163773
Freq: M, dtype: float64
>>> ts.shift(2)
2000-01-31 NaN
2000-02-29 NaN
2000-03-31 0.246254
2000-04-30 0.426385
Freq: M, dtype: float64
>>> ts.shift(-2)
2000-01-31 0.832971
2000-02-29 1.163773
2000-03-31 NaN
2000-04-30 NaN
Freq: M, dtype: float64 #计算百分比变化
>>> ts/ts.shift(1)-1
2000-01-31 NaN
2000-02-29 0.731486
2000-03-31 0.953564
2000-04-30 0.397135
Freq: M, dtype: float64 >>> ts.shift(2,freq='M')
2000-03-31 0.246254
2000-04-30 0.426385
2000-05-31 0.832971
2000-06-30 1.163773
Freq: M, dtype: float64 >>> ts.shift(3,freq='D')
2000-02-03 0.246254
2000-03-03 0.426385
2000-04-03 0.832971
2000-05-03 1.163773
dtype: float64 >>> ts.shift(1,freq='3D')
2000-02-03 0.246254
2000-03-03 0.426385
2000-04-03 0.832971
2000-05-03 1.163773
dtype: float64 >>> ts.shift(1,freq='90T')
2000-01-31 01:30:00 0.246254
2000-02-29 01:30:00 0.426385
2000-03-31 01:30:00 0.832971
2000-04-30 01:30:00 1.163773
Freq: M, dtype: float64 >>> from pandas.tseries.offsets import Day,MonthEnd
>>> now=datetime(2011,11,17)
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> now+3*Day()
Timestamp('2011-11-20 00:00:00')
>>> now+MonthEnd()
Timestamp('2011-11-30 00:00:00')
>>> now+MonthEnd(2)
Timestamp('2011-12-31 00:00:00') >>> offset=MonthEnd()
>>> offset.rollforward(now)
Timestamp('2011-11-30 00:00:00')
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> offset.rollback(now)
Timestamp('2011-10-31 00:00:00') >>> ts=Series(np.random.randn(20),index=pd.date_range('1/12/2016',periods=20,freq='4d'))
>>> ts.groupby(offset.rollforward).mean()
2016-01-31 -0.023515
2016-02-29 0.332412
2016-03-31 0.445600
dtype: float64 >>> ts.resample('M',how='mean')
2016-01-31 0.705208
2016-02-29 -0.174444
2016-03-31 0.534282
Freq: M, dtype: float64 #时间算术运算
>>> p=pd.Period(2016,freq='A-DEC')
>>> p
Period('2016', 'A-DEC')
>>> p+5
Period('2021', 'A-DEC')
>>> p-2
Period('2014', 'A-DEC')
>>> pd.Period('2014',freq='A-DEC')-p
-2
>>> rng=pd.period_range('1/1/2016','6/30/2016',freq='M')
>>> rng
PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06'], dtype='int64', freq='M') >>> rng=pd.period_range('1/1/2016','6/30/2016',freq='M')
>>> Series(np.random.randn(6),index=rng)
2016-01 -0.739693
2016-02 -0.928667
2016-03 0.176348
2016-04 1.343980
2016-05 -1.513816
2016-06 0.654137
Freq: M, dtype: float64 >>> values=['2010Q3','2012Q2','2013Q1']
>>> index=pd.PeriodIndex(values,freq='Q-DEC')
>>> index
PeriodIndex(['2010Q3', '2012Q2', '2013Q1'], dtype='int64', freq='Q-DEC') #时间频度转换
>>> p=pd.Period('2007',freq='A-DEC')
>>> p.asfreq('M',how='start')
Period('2007-01', 'M')
>>> p.asfreq('M',how='end')
Period('2007-12', 'M')
>>> p=pd.Period('2007',freq='A-FEB')
>>> p.asfreq('M',how='start')
Period('2006-03', 'M')
>>> p.asfreq('M',how='end')
Period('2007-02', 'M') #
#
#
#
#
#
#
#
#
#
#
#
#
#
#查询空列
fNull=full[full.End_y.isnull()]
pd.value_counts(cats) #isin
#isin 的使用
>>> p1299del[p1299del.Gene_symbol.isin(['TP53','EGFR'])].count()[1]
>>> p1299snp[p1299snp.Gene_symbol.isin(['TP53','EGFR'])].count()[1]
>>> p297['TUMOR']
44667 0/1:.:114:110:4:3.51%:26,84,3,1
44668 1/1:.:111:6:104:94.55%:3,3,25,79
44669 0/1:.:19:12:7:36.84%:5,7,1,6
Name: TUMOR, dtype: object >>> p297['TUMOR'].str.split(':').str.get(5).str.replace('%','')
44666 20.69
44667 3.51
44668 94.55
44669 36.84 #将新的列插入到dataframe中
>>> p297all=pd.concat([p297,s1],axis=1)
>>> p297.ix[1]
#CHROM chr1
POS 131114
ID .
REF C
ALT T
QUAL .
FILTER PASS
INFO DP=339;SS=1;SSC=13;GPV=1E0;SPV=4.0882E-2
FORMAT GT:GQ:DP:RD:AD:FREQ:DP4
NORMAL 0/1:.:69:67:2:2.9%:53,14,1,1
TUMOR 0/1:.:270:243:27:10%:188,55,20,7
Name: 1, dtype: object
>>> p297all.ix[1]
#CHROM chr1
POS 131114
ID .
REF C
ALT T
QUAL .
FILTER PASS
INFO DP=339;SS=1;SSC=13;GPV=1E0;SPV=4.0882E-2
FORMAT GT:GQ:DP:RD:AD:FREQ:DP4
NORMAL 0/1:.:69:67:2:2.9%:53,14,1,1
TUMOR 0/1:.:270:243:27:10%:188,55,20,7
Q 10[多的]
Name: 1, dtype: object
#列大于的集合
p297all[p297all.Q.ge('1')]
# if then
>>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]})
>>> df
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50 >>> df.ix[df.AAA>=5]
AAA BBB CCC
1 5 20 50
2 6 30 -30
3 7 40 -50
>>> df.ix[df.AAA>=5,'BBB']=-1
>>> df
AAA BBB CCC
0 4 10 100
1 5 -1 50
2 6 -1 -30
3 7 -1 -50
>>> df.ix[df.AAA>=5,['BBB','CCC']]=555
>>> df
AAA BBB CCC
0 4 10 100
1 5 555 555
2 6 555 555
3 7 555 555
>>> df.ix[df.AAA<5,['BBB','CCC']]=2000
>>> df
AAA BBB CCC
0 4 2000 2000
1 5 555 555
2 6 555 555
3 7 555 555
>>> df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2})
>>> df_mask
AAA BBB CCC
0 True False True
1 True False False
2 True False True
3 True False False
>>>
>>> df.where(df_mask,-1000)
AAA BBB CCC
0 4 -1000 2000
1 5 -1000 -1000
2 6 -1000 555
3 7 -1000 -1000 >>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]});
>>> df
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50
>>>
>>> df['logic'] = np.where(df['AAA'] > 5,'high','low');
>>> df
AAA BBB CCC logic
0 4 10 100 low
1 5 20 50 low
2 6 30 -30 high
3 7 40 -50 high >>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]});
>>> df
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50
>>> dflow = df[df.AAA <= 5]
>>> dfhigh = df[df.AAA > 5]
>>> dflow,dfhigh
( AAA BBB CCC
0 4 10 100
1 5 20 50, AAA BBB CCC
2 6 30 -30
3 7 40 -50)
>>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]});
>>> newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']; newseries
0 4
1 5
Name: AAA, dtype: int64
>>> df.loc[1]
AAA 5
BBB 20
CCC 50
Name: 1, dtype: int64
>>> newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40)]; newseries
AAA BBB CCC
0 4 10 100
1 5 20 50
>>> newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries;
0 4
1 5
2 6
3 7
Name: AAA, dtype: int64
#(df.CCC-50) 的绝对值 进行排序
>>> df.ix[(df.CCC-50).abs().argsort()]
AAA BBB CCC
1 5 20 50
0 4 10 100
2 6 30 -30
3 7 40 -50 Crit1 = df.AAA <= 5.5
Crit2 = df.BBB == 10.0
Crit3 = df.CCC > -40.0
AllCrit = Crit1 & Crit2 & Crit3
CritList = [Crit1,Crit2,Crit3]
AllCrit = functools.reduce(lambda x,y: x & y, CritList)
df[AllCrit] Out[27]:
AAA BBB CCC
0 4 10 100 #Selection >>> df[(df.AAA<=6)&(df.index.isin([0,2,4]))]
AAA BBB CCC
0 4 10 100
2 6 30 -30 >>> df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))]
AAA BBB CCC
foo 4 10 100
bar 5 20 50
boo 6 30 -30
kar 7 40 -50 >>> rng = pd.date_range(start="2014-10-07",periods=10,freq='2min')
>>> ts = pd.Series(data = list(range(10)), index = rng)
#解释下 这个x 其实就是分组的list 比如[1,1,1,2,2,3,3,3,4,4] 分组x=[1,1,1] ...x=[4,4]
>>> def MyCust(x):
... if len(x)>2:
... return x[1]*2
... return pd.NaT
...
>>> mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust}
>>> ts.resample("5min").apply(mhc)
Max Custom Mean
2014-10-07 00:00:00 2 2 1.0
2014-10-07 00:05:00 4 NaT 3.5
2014-10-07 00:10:00 7 12 6.0
2014-10-07 00:15:00 9 NaT 8.5 >>> df['Counts'] = df.groupby(['Color']).transform(len)
>>> df
Color Value Counts
0 Red 100 3
1 Red 150 3
2 Red 50 3
3 Blue 50 1 >>> df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1)
>>> df
beyer line_race beyer_shifted
Last Gunfighter 99 10 NaN
Last Gunfighter 102 10 99.0
Last Gunfighter 103 8 102.0
Paynter 103 10 NaN
Paynter 88 10 103.0
Paynter 100 8 88.0
#如何替换 对应值
>>> data.ix[1:3]
barcode sex age rsid genotype proj question answer
1 111-1112-0082 女 27 rs17822931 CT 耳垢 你的耳垢类型是? 不清楚
2 111-1112-4110 男 38 rs17822931 CT 耳垢 你的耳垢类型是? 湿耳
3 111-1112-7043 男 33 rs17822931 TT 耳垢 你的耳垢类型是? 干耳
>>> data.loc[(data.sex==u'男'),'sex']='male'
>>> data.loc[(data.sex==u'女'),'sex']='female'
>>> data.ix[1:3]
barcode sex age rsid genotype proj question answer
1 111-1112-0082 female 27 rs17822931 CT 耳垢 你的耳垢类型是? 不清楚
2 111-1112-4110 male 38 rs17822931 CT 耳垢 你的耳垢类型是? 湿耳
3 111-1112-7043 male 33 rs17822931 TT 耳垢 你的耳垢类型是? 干耳
#sex[‘male’,’female’],而将其平展开为’sex_male’,’sex_female’两个属性
>>> data_sex = pd.get_dummies(data['sex'], prefix= 'sex')
>>> data_sex.ix[1:3]
sex_female sex_male
1 1.0 0.0
2 0.0 1.0
3 0.0 1.0
data.loc[(data.answer==u'干耳'),'answer']='dry'
data.loc[(data.answer==u'湿耳'),'answer']='wet'
#过滤掉回答为“不清楚”的记录
filterdata=data[data.answer!=u'不清楚']
filterdata.ix[1:3]
barcode sex age rsid genotype proj question answer
2 111-1112-4110 male 38 rs17822931 CT 耳垢 你的耳垢类型是? wet
3 111-1112-7043 male 33 rs17822931 TT 耳垢 你的耳垢类型是? dry
dummies_answer = pd.get_dummies(filterdata['answer'], prefix= 'answer')
dummies_sex = pd.get_dummies(filterdata['sex'], prefix= 'sex')
dummies_genotype=pd.get_dummies(filterdata['genotype'], prefix= 'genotype')
filterdatafull=pd.concat([filterdata,dummies_answer,dummies_sex,dummies_genotype], axis=1)
filterdatafull.drop(['sex','rsid','genotype','answer','proj','question'], axis=1, inplace=True)
数据挖掘之pandas的更多相关文章
- 数据挖掘---Pandas的学习
Pandas介绍(panel + data + analysis) 为什么使用Pandas 便捷的数据处理能力 读取文件方便 封装了Matplotlib.Nu ...
- windows下数据挖掘相关包numpy、pandas的安装
安装Anaconda的绕道 这里介绍如何在windows下安装numpy/scipy/matplotlib/pandas/scikit_learn等数据分析相关包 相关环境: win7 64位 pyt ...
- 吴裕雄 数据挖掘与分析案例实战(4)——python数据处理工具:Pandas
# 导入模块import pandas as pdimport numpy as np # 构造序列gdp1 = pd.Series([2.8,3.01,8.99,8.59,5.18])print(g ...
- 1 python大数据挖掘系列之基础知识入门
preface Python在大数据行业非常火爆近两年,as a pythonic,所以也得涉足下大数据分析,下面就聊聊它们. Python数据分析与挖掘技术概述 所谓数据分析,即对已知的数据进行分析 ...
- Pandas中DateFrame修改列名
Pandas中DateFrame修改列名 在做数据挖掘的时候,想改一个DataFrame的column名称,所以就查了一下,总结如下: 数据如下: >>>import pandas ...
- Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱(转)
原文:http://www.52nlp.cn/python-网页爬虫-文本处理-科学计算-机器学习-数据挖掘 曾经因为NLTK的缘故开始学习Python,之后渐渐成为我工作中的第一辅助脚本语言,虽然开 ...
- [resource-]Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
reference: http://www.52nlp.cn/python-%e7%bd%91%e9%a1%b5%e7%88%ac%e8%99%ab-%e6%96%87%e6%9c%ac%e5%a4% ...
- Ubuntu系统下创建python数据挖掘虚拟环境
虚拟环境: 虚拟环境是用于创建独立的python环境,允许我们使用不同的python模块和版本,而不混淆. 让我们了解一下产品研发过程中虚拟环境的必要性,在python项目中,显然经常要使用不 ...
- 我的Pandas应用场景(2)
上文交代了一些啰嗦事,本文开始,就要来点实际的了. 先来一个比较简单的场景: Given:一个包括N(极其复杂,这里取3个)个列的DataFrame:df,df包括index: And:对df所有列元 ...
随机推荐
- IOS YYKit 源码解析
https://blog.csdn.net/weixin_33874713/article/details/87034047
- 【转】linux之shfit
位置参数可以用shift命令左移.比如shift 3表示原来的$4现在变成$1,原来的$5现在变成$2等等,原来的$1.$2.$3丢弃,$0不移动.不带参数的shift命令相当于shift 1. 非常 ...
- poj 3693 Maximum repetition substring 重复次数最多的连续子串
题目链接 题意 对于任意的字符串,定义它的 重复次数 为:它最多可被划分成的完全相同的子串个数.例如:ababab 的重复次数为3,ababa 的重复次数为1. 现给定一字符串,求它的一个子串,其重复 ...
- [转]在Storyboard中使用自定义的segue类型
转自:http://my.oschina.net/u/728866/blog/92709 我们知道segue共有三种类型:push.modal以及custom.如下图: 很明显,这三种类型的作用分 ...
- 学习环境配置:Manjaro、MSYS2以及常见软件
0.前言 在说Manjaro之前,要先说一下Linux发行版.对于各大发行版而言,内核只有版本的差异,最重要的区别就是包管理系统.常见的包管理系统包括:Pacman,Apt , Yum和Portage ...
- uva 1149:Bin Packing(贪心)
题意:给定N物品的重量,背包容量M,一个背包最多放两个东西.问至少多少个背包. 思路:贪心,最大的和最小的放.如果这样都不行,那最大的一定孤独终生.否则,相伴而行. 代码: #include < ...
- LeetCode OJ-- Jump Game
https://oj.leetcode.com/problems/jump-game/ 从0开始,根据每一位上存的数值往前跳. 这道题给想复杂了... 记录当前位置 pos,记录可以调到的最远达位置为 ...
- python 安装cx_Oracle模块, MySQLdb模块, Tornado
一,想访问远程Oracle数据库,本地又不想安装几百兆的Oracle Client(也木有root权限),安装python的cx_Oralce 模块需要依赖Oracle Instant Client ...
- springboot 2.0.8 跳转jsp页面
springboot项目创建教程 https://blog.csdn.net/q18771811872/article/details/88126835 springboot 2.0跳转 html教程 ...
- 苹果iOS APP配置HTTPS,iOS ATS配置SSL,苹果ATS标准解决方案
参考沃通: