import numpy as np
import pandas as pd

# --- Series ---
# Tutorial walk-through of pandas.Series: construction, selection, conversion
# to/from dict, re-labelling, null checks and naming.  The commented tables
# are the output recorded by the original author.

# Create a Series, method 1: from a plain list (a default RangeIndex is used).
s1 = pd.Series([1, 2, 3, 4])
# NOTE: the bare expressions below only echo a value in a notebook/REPL;
# when run as a script they are no-ops.
s1
# 0    1
# 1    2
# 2    3
# 3    4
# dtype: int64
s1.values  # array([1, 2, 3, 4], dtype=int64)
s1.index  # RangeIndex(start=0, stop=4, step=1)

# Create a Series, method 2: from a NumPy array.
s2 = pd.Series(np.arange(5, 10))
print(s2)
# 0    5
# 1    6
# 2    7
# 3    8
# 4    9
# dtype: int32   (as recorded by the author; the integer width is platform dependent)

# Create a Series, method 3: from a dict (the keys become the index).
s3 = pd.Series({'5': 1, '6': 3, '7': 9})
print(s3)
# 5    1
# 6    3
# 7    9
# dtype: int64
print(s3.index)  # Index(['5', '6', '7'], dtype='object')

# Create a Series, method 4: from a list plus explicit index labels.
s4 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(s4)
# A    1
# B    2
# C    3
# D    4
# dtype: int64

# Selecting values: by label, and by boolean mask.
print(s4['A'])  # 1
print(s4[s4 > 2])
# C    3
# D    4
# dtype: int64

# Convert a Series to a dict.  The result is deliberately NOT named `dict`,
# which would shadow the builtin for the rest of the script.
s4_dict = s4.to_dict()
print(s4_dict)  # {'A': 1, 'B': 2, 'C': 3, 'D': 4}

# Convert a dict back to a Series.
seri = pd.Series(s4_dict)
print(seri)
# A    1
# B    2
# C    3
# D    4
# dtype: int64

# Change the index of a Series: labels missing from s4 become NaN, which
# also turns the values into floats.
index_1 = ['z', 'A', 'B', 'v', 'C']
s5 = pd.Series(s4, index=index_1)
print(s5)
# z    NaN
# A    1.0
# B    2.0
# v    NaN
# C    3.0
# dtype: float64

# Test for null values.
print(pd.isnull(s5))
# z     True
# A    False
# B    False
# v     True
# C    False
# dtype: bool
print(pd.notnull(s5))
# z    False
# A     True
# B     True
# v    False
# C     True
# dtype: bool

# Give the Series a name.
s5.name = 'demo'
print(s5)
# z    NaN
# A    1.0
# B    2.0
# v    NaN
# C    3.0
# Name: demo, dtype: float64

# Give the index a name as well (this statement had been swallowed into the
# preceding output comment, so the name was never actually set).
s5.index.name = 'demo index'
print(s5.index)  # Index(['z', 'A', 'B', 'v', 'C'], dtype='object', name='demo index')
# --- DataFrame ---
# Tutorial walk-through of pandas.DataFrame built from clipboard contents.
# The commented tables are the output recorded by the original author.
from pandas import Series,DataFrame
import webbrowser

link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)  # open the page in the default browser
# Copy the following table from that page to the clipboard before continuing:
#    Jan 2019  Jan 2018  Change Programming Language  Ratings Change.1
# 0         1         1     NaN                 Java  16.904%   +2.69%
# 1         2         2     NaN                    C  13.337%   +2.30%
# 2         3         4  change               Python   8.294%   +3.62%
# 3         4         3  change                  C++   8.158%   +2.55%
# 4         5         7  change    Visual Basic .NET   6.459%   +3.20%

df = pd.read_clipboard()  # build a DataFrame from whatever is on the clipboard
type(df)  # pandas.core.frame.DataFrame (only echoed in a notebook/REPL)
print(df)  # prints a DataFrame with the same content as the table above
#    Jan 2019  Jan 2018  Change Programming Language  Ratings Change.1
# 0         1         1     NaN                 Java  16.904%   +2.69%
# 1         2         2     NaN                    C  13.337%   +2.30%
# 2         3         4  change               Python   8.294%   +3.62%
# 3         4         3  change                  C++   8.158%   +2.55%
# 4         5         7  change    Visual Basic .NET   6.459%   +3.20%

# Column names.
print(df.columns)
# Index(['Jan 2019', 'Jan 2018', 'Change', 'Programming Language', 'Ratings',
#        'Change.1'], dtype='object')

# Values of a single column, via attribute access.
print(df.Ratings)
# 0    16.904%
# 1    13.337%
# 2     8.294%
# 3     8.158%
# 4     6.459%
# Name: Ratings, dtype: object

# 'Jan 2019' contains a space, so attribute access cannot be used; index with
# the column name instead.  To fetch two columns use
# df[['Jan 2019', 'Ratings']], which returns a DataFrame rather than a Series.
print(df['Jan 2019'])
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# Name: Jan 2019, dtype: int64
print(type(df.Ratings), type(df['Jan 2019']))
# <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>

# Build a new DataFrame from selected columns of an existing one.
df_new = DataFrame(df, columns=['Programming Language', 'Jan 2019'])
print(df_new)
#   Programming Language  Jan 2019
# 0                 Java         1
# 1                    C         2
# 2               Python         3
# 3                  C++         4
# 4    Visual Basic .NET         5

# Same, but requesting a column that does not exist in the source: it is
# created and filled with NaN.
df_new2 = DataFrame(df, columns=['new lie', 'Jan 2019'])
print(df_new2)
#    new lie  Jan 2019
# 0      NaN         1
# 1      NaN         2
# 2      NaN         3
# 3      NaN         4
# 4      NaN         5

# The new column can then be assigned from a range, a NumPy array or a Series.
df_new2['new lie'] = range(5, 10)
df_new2['new lie'] = np.arange(5, 10)
df_new2['new lie'] = pd.Series(np.arange(5, 10))
print(df_new2)
#    new lie  Jan 2019
# 0        5         1
# 1        6         2
# 2        7         3
# 3        8         4
# 4        9         5

# Assigning a Series that only carries index labels 2 and 3 sets those two rows;
# every other row of the column becomes NaN because assignment aligns on index.
# (This statement had been swallowed into the preceding output comment.)
df_new2['new lie'] = pd.Series([200, 200], index=[2, 3])
print(df_new2)
#    new lie  Jan 2019
# 0      NaN         1
# 1      NaN         2
# 2    200.0         3
# 3    200.0         4
# 4      NaN         5

# --- Series and DataFrame ---
# How a DataFrame relates to Series: columns are Series, rows come back from
# iterrows() as (index, Series) tuples, and a DataFrame can be assembled from
# several Series.  Commented tables are the output recorded by the author.

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

data = {
    'country': ['Belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasillia'],
    'Population': [11190846, 1303171035, 207847528],
}

# Series
s1 = pd.Series(data['country'])
# 0    Belgium
# 1      India
# 2     Brazil
# dtype: object
# NOTE: bare expressions only echo a value in a notebook/REPL.
s1.values  # array(['Belgium', 'India', 'Brazil'], dtype=object)
s1.index  # RangeIndex(start=0, stop=3, step=1)

# DataFrame
df1 = pd.DataFrame(data)  # build a DataFrame from a dict of columns
#    country    Capital  Population
# 0  Belgium   Brussels    11190846
# 1    India  New Delhi  1303171035
# 2   Brazil  Brasillia   207847528
df1['country']  # access one column
df1.country  # attribute access to the same column
# 0    Belgium
# 1      India
# 2     Brazil
# Name: country, dtype: object
type(df1['country'])  # pandas.core.series.Series

# Accessing the rows of a DataFrame: iterrows() is a generator ...
df1.iterrows()  # <generator object DataFrame.iterrows at 0x...>
# ... that yields one 2-tuple per row.
for row in df1.iterrows():
    print(row)
    print('类型:', type(row))
    print('长度:', len(row), '\n')
# (0, country        Belgium
# Capital       Brussels
# Population    11190846
# Name: 0, dtype: object)
# 类型: <class 'tuple'>
# 长度: 2
#
# (1, country            India
# Capital        New Delhi
# Population    1303171035
# Name: 1, dtype: object)
# 类型: <class 'tuple'>
# 长度: 2
#
# (2, country          Brazil
# Capital       Brasillia
# Population    207847528
# Name: 2, dtype: object)
# 类型: <class 'tuple'>
# 长度: 2

# Inspect only the first tuple: element 0 is the index label, element 1 is
# the row as a Series.
for row in df1.iterrows():
    print('第一个:', row[0])
    print('第二个:', row[1], '\n')
    print('类型:', type(row[0]), type(row[1]))
    break
# 第一个: 0
# 第二个: country        Belgium
# Capital       Brussels
# Population    11190846
# Name: 0, dtype: object
#
# 类型: <class 'int'> <class 'pandas.core.series.Series'>

# Build a DataFrame from Series: each Series becomes one ROW.
s1 = pd.Series(data['Capital'])
s2 = pd.Series(data['country'])
s3 = pd.Series(data['Population'])
df_new1 = pd.DataFrame([s1, s2, s3])
print(df_new1)
#           0           1          2
# 0  Brussels   New Delhi  Brasillia
# 1   Belgium       India     Brazil
# 2  11190846  1303171035  207847528
print(df_new1.T)  # transpose so each Series becomes a column
#            0        1           2
# 0   Brussels  Belgium    11190846
# 1  New Delhi    India  1303171035
# 2  Brasillia   Brazil   207847528

# Label the rows before transposing so they end up as column names.
df_new2 = pd.DataFrame([s1, s2, s3], index=['Capital', 'country', 'Population']).T
print(df_new2)
#      Capital  country  Population
# 0   Brussels  Belgium    11190846
# 1  New Delhi    India  1303171035
# 2  Brasillia   Brazil   207847528

 pandas中的DataFrame的IO操作:

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link='http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)#Open the pandas I/O documentation page in the default browser
#Copy the following table from that page to the clipboard
'''
Format Type Data Description Reader Writer
text CSV read_csv to_csv
text JSON read_json to_json
text HTML read_html to_html
text Local clipboard read_clipboard to_clipboard
binary MS Excel read_excel to_excel
binary HDF5 Format read_hdf to_hdf
binary Feather Format read_feather to_feather
binary Msgpack read_msgpack to_msgpack
binary Stata read_stata to_stata
binary SAS read_sas
binary Python Pickle Format read_pickle to_pickle
SQL SQL read_sql to_sql
SQL Google Big Query read_gbq to_gbq
'''
df1=pd.read_clipboard()
print(df1)
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
'''
df1.to_clipboard()#Copy the contents of df1 to the clipboard
df1.to_csv('df1.csv')#Write df1 to df1.csv, including the index
df1.to_csv('df11.csv',index=False)#Write df1 to df11.csv, without the index
df2=pd.read_csv('df11.csv')#Read the index-free CSV file back in
print(df2)
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
'''
df3=df1.to_json()#With no path argument, to_json returns the JSON text as a str
print(df3)
'''
{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},
"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},
"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},
"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":" ","10":"to_pickle","11":"to_sql","12":"to_gbq"}}
'''
# Parse the JSON text back into a DataFrame.  Passing a literal JSON string
# straight to read_json is deprecated in recent pandas releases, so wrap it in
# a file-like StringIO object.
from io import StringIO
print(pd.read_json(StringIO(df3)))
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
'''
df1.to_json('df1.json')#Write df1 to a JSON file
print(pd.read_json('df1.json'))#Read the JSON file back in
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
'''
df1.to_html('df1.html')#Write df1 to an HTML file
print(pd.read_html('df1.html'))#Read the HTML file back in; the recorded output below is a list containing one DataFrame
'''
[ Unnamed: 0 Format Type Data Description Reader Writer
0 0 text CSV read_csv to_csv
1 1 text JSON read_json to_json
2 2 text HTML read_html to_html
3 3 text Local clipboard read_clipboard to_clipboard
4 4 binary MS Excel read_excel to_excel
5 5 binary HDF5 Format read_hdf to_hdf
6 6 binary Feather Format read_feather to_feather
7 7 binary Msgpack read_msgpack to_msgpack
8 8 binary Stata read_stata to_stata
9 9 binary SAS read_sas NaN
10 10 binary Python Pickle Format read_pickle to_pickle
11 11 SQL SQL read_sql to_sql
12 12 SQL Google Big Query read_gbq to_gbq]
'''
df1.to_excel('df1.xlsx')#Write df1 to an Excel file
print(pd.read_excel('df1.xlsx'))#Read the Excel file back in
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
'''

Series和DataFrame的indexing

import numpy as np
import pandas as pd
imdb=pd.read_csv(r'C:\Users\Administrator\Desktop\py_work\codes\presidential_polls.csv')
print(imdb.shape)#Show (rows, columns); recorded result: (10236, 27)
print(imdb.head())#Prints the first five rows by default
'''
cycle branch type matchup forecastdate \
0 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
1 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
2 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
3 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
4 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16 state startdate enddate pollster grade \
0 U.S. 10/25/2016 10/31/2016 Google Consumer Surveys B
1 U.S. 10/27/2016 10/30/2016 ABC News/Washington Post A+
2 Virginia 10/27/2016 10/30/2016 ABC News/Washington Post A+
3 Florida 10/20/2016 10/24/2016 SurveyUSA A
4 U.S. 10/20/2016 10/25/2016 Pew Research Center B+ ... adjpoll_clinton adjpoll_trump adjpoll_johnson \
0 ... 42.64140 40.86509 5.675099
1 ... 43.29659 44.72984 3.401513
2 ... 46.29779 40.72604 6.401513
3 ... 46.35931 45.30585 1.777730
4 ... 45.32744 42.20888 3.618320 adjpoll_mcmullin multiversions \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN url poll_id question_id \
0 https://datastudio.google.com/u/0/#/org//repor... 47940 74999
1 http://www.langerresearch.com/wp-content/uploa... 47881 74936
2 https://www.washingtonpost.com/local/virginia-... 47880 74934
3 http://www.baynews9.com/content/news/baynews9/... 47465 74252
4 http://www.people-press.org/2016/10/27/as-elec... 47616 74519 createddate timestamp
0 11/1/16 15:09:38 1 Nov 2016
1 11/1/16 15:09:38 1 Nov 2016
2 11/1/16 15:09:38 1 Nov 2016
3 10/25/16 15:09:38 1 Nov 2016
4 10/27/16 15:09:38 1 Nov 2016 [5 rows x 27 columns]
'''
print(imdb.tail())#Prints the last five rows by default; used the same way as head()
'''
cycle branch type matchup \
10231 2016 President polls-only Clinton vs. Trump vs. Johnson
10232 2016 President polls-only Clinton vs. Trump vs. Johnson
10233 2016 President polls-only Clinton vs. Trump vs. Johnson
10234 2016 President polls-only Clinton vs. Trump vs. Johnson
10235 2016 President polls-only Clinton vs. Trump vs. Johnson forecastdate state startdate enddate \
10231 11/1/16 Alabama 9/30/2016 10/13/2016
10232 11/1/16 Virginia 9/30/2016 10/6/2016
10233 11/1/16 Virginia 9/16/2016 9/22/2016
10234 11/1/16 North Carolina 6/20/2016 6/21/2016
10235 11/1/16 Utah 7/29/2016 8/18/2016 pollster grade ... adjpoll_clinton \
10231 Ipsos A- ... 37.30964
10232 Ipsos A- ... 49.13094
10233 Ipsos A- ... 45.97130
10234 Public Policy Polling B+ ... 45.29390
10235 Ipsos A- ... 31.62721 adjpoll_trump adjpoll_johnson adjpoll_mcmullin multiversions \
10231 54.76821 NaN NaN NaN
10232 39.41588 NaN NaN NaN
10233 39.97518 NaN NaN NaN
10234 46.66175 1.596946 NaN NaN
10235 44.65947 NaN NaN NaN url poll_id \
10231 http://reuters.com/statesofthenation/ 46817
10232 http://www.reuters.com/statesofthenation/ 46675
10233 http://www.reuters.com/statesofthenation/ 46096
10234 http://www.publicpolicypolling.com/pdf/2015/PP... 44400
10235 http://www.reuters.com/statesofthenation 44978 question_id createddate timestamp
10231 73263 10/15/16 14:57:58 1 Nov 2016
10232 72969 10/10/16 14:57:58 1 Nov 2016
10233 72088 9/26/16 14:57:58 1 Nov 2016
10234 67363 6/23/16 14:57:58 1 Nov 2016
10235 69011 8/24/16 14:57:58 1 Nov 2016 [5 rows x 27 columns]
'''

  

print(imdb.iloc[10:13,0:5])#Rows 10-12 and columns 0-4: iloc selects by integer position, like list slicing, so the end position is excluded
'''
cycle branch type matchup forecastdate
10 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
11 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
12 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
'''
df=imdb.iloc[10:13,0:5]
print(df.iloc[1:3,1:3])
'''
branch type
11 President polls-plus
12 President polls-plus
'''
print(df.loc[10:12,:'type'])#loc selects by label, and a label slice includes its end label
'''
cycle branch type
10 2016 President polls-plus
11 2016 President polls-plus
12 2016 President polls-plus
'''
print(imdb['adjpoll_clinton'].head())#Look at a single column
'''
0 42.64140
1 43.29659
2 46.29779
3 46.35931
4 45.32744
Name: adjpoll_clinton, dtype: float64
'''
print(imdb['adjpoll_clinton'][10])#Look up a single element; recorded result: 44.53217
print(imdb[['adjpoll_trump','adjpoll_johnson']])#Selecting with a list of column names yields a new DataFrame
'''
adjpoll_trump adjpoll_johnson
0 40.86509 5.675099
1 44.72984 3.401513
2 40.72604 6.401513
3 45.30585 1.777730
4 42.20888 3.618320
5 42.26663 6.114222
6 43.56017 3.153590
7 43.50333 3.466432
8 37.24948 6.420006
9 41.69540 4.220173
10 43.84845 NaN
11 47.92262 2.676897
12 29.50605 3.170510
13 40.34972 5.823322
14 42.01937 6.499082
15 45.07725 3.499082
16 39.33826 5.044833
17 46.11255 3.054228
18 39.80679 6.359501
19 41.34735 4.421316
20 39.99571 6.272840
21 50.75720 NaN
22 38.87231 8.359501
23 41.55637 4.964521
24 43.84806 5.359501
25 45.03370 2.193952
26 44.78595 4.359501
27 44.18040 5.160502
28 40.41809 3.333669
29 49.47709 4.308866
... ... ...
10206 36.75014 9.152230
10207 40.08237 NaN
10208 43.67710 NaN
10209 43.40106 NaN
10210 35.52956 NaN
10211 35.03328 NaN
10212 44.77681 NaN
10213 38.24798 NaN
10214 41.25978 NaN
10215 41.59738 NaN
10216 41.64499 1.974752
10217 36.15054 NaN
10218 38.65057 NaN
10219 29.49314 9.007062
10220 37.87221 NaN
10221 39.42957 NaN
10222 53.95455 NaN
10223 33.07150 3.328916
10224 41.88533 1.974752
10225 36.82408 9.741756
10226 47.80848 NaN
10227 42.01089 3.671217
10228 45.06726 NaN
10229 40.16534 12.889780
10230 41.56030 2.872088
10231 54.76821 NaN
10232 39.41588 NaN
10233 39.97518 NaN
10234 46.66175 1.596946
10235 44.65947 NaN [10236 rows x 2 columns]
'''  

# --- Reindexing Series and DataFrame ---
# reindex() conforms an object to a new set of labels; drop() removes labels.
# Commented tables are the output recorded by the original author (the
# DataFrame values are random, so they differ on every run).

import numpy as np
import pandas as pd

s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(s1)
# A    1
# B    2
# C    3
# D    4
# dtype: int64

# Labels that do not exist yet ('E') are filled with NaN.
print(s1.reindex(['A', 'C', 'E']))
# A    1.0
# C    3.0
# E    NaN
# dtype: float64

# fill_value supplies the value to use for new labels instead of NaN.
print(s1.reindex(['A', 'C', 'm'], fill_value=11))
# A     1
# C     3
# m    11
# dtype: int64

s2 = pd.Series(['a', 'b', 'c'], index=[1, 5, 10])
print(s2)
# 1     a
# 5     b
# 10    c
# dtype: object
print(s2.reindex(index=range(15)))
# 0     NaN
# 1       a
# 2     NaN
# 3     NaN
# 4     NaN
# 5       b
# 6     NaN
# 7     NaN
# 8     NaN
# 9     NaN
# 10      c
# 11    NaN
# 12    NaN
# 13    NaN
# 14    NaN
# dtype: object

# Forward fill: label 0 stays NaN (nothing precedes it), 1-4 take 'a',
# 5-9 take 'b', and 10 onwards take 'c'.
print(s2.reindex(index=range(15), method='ffill'))
# 0     NaN
# 1       a
# 2       a
# 3       a
# 4       a
# 5       b
# 6       b
# 7       b
# 8       b
# 9       b
# 10      c
# 11      c
# 12      c
# 13      c
# 14      c
# dtype: object

# reindex returned new objects; s2 itself is unchanged.
print(s2)
# 1     a
# 5     b
# 10    c
# dtype: object

df1 = pd.DataFrame(np.random.rand(25).reshape([5, 5]))
print(df1)
#           0         1         2         3         4
# 0  0.499115  0.244375  0.849224  0.348352  0.472657
# 1  0.676503  0.769790  0.479774  0.468003  0.703029
# 2  0.153982  0.699009  0.379184  0.151905  0.921860
# 3  0.904037  0.196925  0.421180  0.384442  0.642122
# 4  0.641124  0.748790  0.824351  0.101550  0.412564

df2 = pd.DataFrame(
    np.random.rand(25).reshape([5, 5]),
    index=['A', 'B', 'D', 'E', 'F'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)
print(df2)
#          c1        c2        c3        c4        c5
# A  0.279563  0.267224  0.077868  0.080046  0.528182
# B  0.660053  0.088954  0.512298  0.259552  0.108562
# D  0.734865  0.776419  0.581695  0.578712  0.157753
# E  0.926365  0.729410  0.328161  0.531319  0.550878
# F  0.849754  0.770988  0.537104  0.833631  0.062303

# Reindex the rows: the new row 'C' is all NaN.
print(df2.reindex(index=['A', 'B', 'C', 'D', 'E', 'F']))
#          c1        c2        c3        c4        c5
# A  0.279563  0.267224  0.077868  0.080046  0.528182
# B  0.660053  0.088954  0.512298  0.259552  0.108562
# C       NaN       NaN       NaN       NaN       NaN
# D  0.734865  0.776419  0.581695  0.578712  0.157753
# E  0.926365  0.729410  0.328161  0.531319  0.550878
# F  0.849754  0.770988  0.537104  0.833631  0.062303

# Reindex the columns: the new column 'c6' is all NaN.
print(df2.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))
#          c1        c2        c3        c4        c5  c6
# A  0.279563  0.267224  0.077868  0.080046  0.528182 NaN
# B  0.660053  0.088954  0.512298  0.259552  0.108562 NaN
# D  0.734865  0.776419  0.581695  0.578712  0.157753 NaN
# E  0.926365  0.729410  0.328161  0.531319  0.550878 NaN
# F  0.849754  0.770988  0.537104  0.833631  0.062303 NaN

# Reindex rows and columns in one call.
print(df2.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'],
                  columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))
#          c1        c2        c3        c4        c5  c6
# A  0.279563  0.267224  0.077868  0.080046  0.528182 NaN
# B  0.660053  0.088954  0.512298  0.259552  0.108562 NaN
# C       NaN       NaN       NaN       NaN       NaN NaN
# D  0.734865  0.776419  0.581695  0.578712  0.157753 NaN
# E  0.926365  0.729410  0.328161  0.531319  0.550878 NaN
# F  0.849754  0.770988  0.537104  0.833631  0.062303 NaN

s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(s1)
# A    1
# B    2
# C    3
# D    4
# dtype: int64

# Reindexing to a subset of the labels keeps only those entries;
# s1.reindex(index=['A', 'C']) is an equivalent spelling.
print(s1.reindex(['A', 'C']))
# A    1
# C    3
# dtype: int64
print(df2.reindex(index=['A', 'C']))
#          c1        c2        c3        c4        c5
# A  0.279563  0.267224  0.077868  0.080046  0.528182
# C       NaN       NaN       NaN       NaN       NaN

# drop() removes the given labels and returns a new object.
print(s1.drop(['B', 'C']))
# A    1
# D    4
# dtype: int64
print(s1.drop('A'))
# B    2
# C    3
# D    4
# dtype: int64
print(df2.drop(['A'], axis=0))  # axis=0 drops rows
#          c1        c2        c3        c4        c5
# B  0.660053  0.088954  0.512298  0.259552  0.108562
# D  0.734865  0.776419  0.581695  0.578712  0.157753
# E  0.926365  0.729410  0.328161  0.531319  0.550878
# F  0.849754  0.770988  0.537104  0.833631  0.062303
print(df2.drop(['c1'], axis=1))  # axis=1 drops columns
#          c2        c3        c4        c5
# A  0.267224  0.077868  0.080046  0.528182
# B  0.088954  0.512298  0.259552  0.108562
# D  0.776419  0.581695  0.578712  0.157753
# E  0.729410  0.328161  0.531319  0.550878
# F  0.770988  0.537104  0.833631  0.062303

# --- NaN: "Not a Number" ---
# NaN as a float value and how a Series reports and drops it.
# Commented tables are the output recorded by the original author.

n = np.nan
print(type(n))  # <class 'float'>
print(1 + n)  # nan: arithmetic between a number and NaN yields NaN

s1 = pd.Series([1, 2, np.nan, 3, 4], index=['A', 'B', 'C', 'D', 'E'])
print(s1)
# A    1.0
# B    2.0
# C    NaN
# D    3.0
# E    4.0
# dtype: float64
print(s1.isnull())
# A    False
# B    False
# C     True
# D    False
# E    False
# dtype: bool
print(s1.notnull())
# A     True
# B     True
# C    False
# D     True
# E     True
# dtype: bool
print(s1.dropna())  # drop the entries whose value is NaN
# A    1.0
# B    2.0
# D    3.0
# E    4.0
# dtype: float64

# --- NaN in DataFrame ---
# Detecting, dropping and filling NaN in a DataFrame.
# Commented tables are the output recorded by the original author.

dframe = pd.DataFrame([[1, 2, 3],
                       [np.nan, 5, 6],
                       [7, np.nan, 9],
                       [np.nan, np.nan, np.nan]])
print(dframe)
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  5.0  6.0
# 2  7.0  NaN  9.0
# 3  NaN  NaN  NaN
print(dframe.isnull())
#        0      1      2
# 0  False  False  False
# 1   True  False  False
# 2  False   True  False
# 3   True   True   True
print(dframe.notnull())
#        0      1      2
# 0   True   True   True
# 1  False   True   True
# 2   True  False   True
# 3  False  False  False

# Defaults are axis=0 and how='any', i.e. dframe.dropna(axis=0, how='any').
print(dframe.dropna())
#      0    1    2
# 0  1.0  2.0  3.0
print(dframe.dropna(how='any'))  # 'any': drop a row if it contains any NaN
#      0    1    2
# 0  1.0  2.0  3.0
print(dframe.dropna(how='all'))  # 'all': drop a row only if every value is NaN
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  5.0  6.0
# 2  7.0  NaN  9.0
print(dframe.dropna(axis=1))  # every column contains a NaN, so only the index is left
# Empty DataFrame
# Columns: []
# Index: [0, 1, 2, 3]

dframe2 = pd.DataFrame([[1, 2, 3, np.nan],
                        [2, np.nan, 5, 6],
                        [np.nan, 7, np.nan, 9],
                        [1, np.nan, np.nan, np.nan]])
print(dframe2)
#      0    1    2    3
# 0  1.0  2.0  3.0  NaN
# 1  2.0  NaN  5.0  6.0
# 2  NaN  7.0  NaN  9.0
# 3  1.0  NaN  NaN  NaN

# Without thresh, every row here has at least one NaN and is dropped.
df2 = dframe2.dropna()
print(df2)
# Empty DataFrame
# Columns: [0, 1, 2, 3]
# Index: []

# thresh=2 keeps a row only if it has at least 2 non-NaN values; row 3 has a
# single non-NaN value, so it is the only row removed.
df3 = dframe2.dropna(thresh=2)
print(df3)
#      0    1    2    3
# 0  1.0  2.0  3.0  NaN
# 1  2.0  NaN  5.0  6.0
# 2  NaN  7.0  NaN  9.0

print(dframe2.fillna(value=10))  # replace every NaN with 10
#       0     1     2     3
# 0   1.0   2.0   3.0  10.0
# 1   2.0  10.0   5.0   6.0
# 2  10.0   7.0  10.0   9.0
# 3   1.0  10.0  10.0  10.0

# A dict gives one fill value per column: column 0 is filled with 'A',
# column 1 with '16', and so on.
# NOTE: as called here, fillna and dropna return new objects and leave the
# original Series/DataFrame unchanged.
print(dframe2.fillna(value={0: 'A', 1: '16', 2: '中国', 3: 'k'}))
#    0   1   2  3
# 0  1   2   3  k
# 1  2  16   5  6
# 2  A   7  中国  9
# 3  1  16  中国  k

# --- Multi-level index ---
# Series and DataFrame with two index levels, and converting between them.
# Commented tables are the output recorded by the original author (the
# Series values are random, so they differ on every run).

s1 = pd.Series(np.random.rand(6),
               index=[['1', '1', '1', '2', '2', '2'],
                      ['a', 'b', 'c', 'a', 'b', 'c']])
print(s1)
# 1  a    0.973831
#    b    0.762415
#    c    0.135763
# 2  a    0.974687
#    b    0.471638
#    c    0.573157
# dtype: float64
print(type(s1))  # <class 'pandas.core.series.Series'>

# Selecting on the outer level returns a Series indexed by the inner level.
print(s1['1'])
# a    0.973831
# b    0.762415
# c    0.135763
# dtype: float64
print(type(s1['1']))  # <class 'pandas.core.series.Series'>
print(s1['1']['a'])  # 0.9738309965219155

# Selecting on the inner level across all outer labels.
print(s1[:, 'a'])
# 1    0.973831
# 2    0.974687
# dtype: float64

# Two ways to turn a two-level Series into a DataFrame.
df1 = s1.unstack()
print(df1)
#           a         b         c
# 1  0.973831  0.762415  0.135763
# 2  0.974687  0.471638  0.573157
df2 = pd.DataFrame([s1['1'], s1['2']])
print(df2)
#           a         b         c
# 0  0.973831  0.762415  0.135763
# 1  0.974687  0.471638  0.573157

# Turn a DataFrame back into a two-level Series (columns become the outer level).
s2 = df1.unstack()
print(s2)
# a  1    0.973831
#    2    0.974687
# b  1    0.762415
#    2    0.471638
# c  1    0.135763
#    2    0.573157
# dtype: float64

# Transposing first restores the original level order.
print(df1.T.unstack())
# 1  a    0.973831
#    b    0.762415
#    c    0.135763
# 2  a    0.974687
#    b    0.471638
#    c    0.573157
# dtype: float64

# A DataFrame can have multiple levels on both rows and columns.
df = pd.DataFrame(np.arange(16).reshape(4, 4),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['BJ', 'BJ', '上海', '广州'], [111, 222, 111, 222]])
print(df)
#      BJ      上海  广州
#     111 222 111 222
# a 1   0   1   2   3
#   2   4   5   6   7
# b 1   8   9  10  11
#   2  12  13  14  15
print(df['BJ'])
#      111  222
# a 1    0    1
#   2    4    5
# b 1    8    9
#   2   12   13
print(type(df['BJ']))  # <class 'pandas.core.frame.DataFrame'>
print(df['BJ', 111])  # df['BJ'][111] gives the same values
# a  1     0
#    2     4
# b  1     8
#    2    12
# Name: (BJ, 111), dtype: int32

# --- Mapping ---
# Two ways to add a column, and why map() through a dict is the safer one.
# Commented tables are the output recorded by the original author.

df1 = pd.DataFrame({'城市': ['北京', '上海', '广州'], '人口': [1000, 2000, 1500]})
print(df1)
#    城市    人口
# 0  北京  1000
# 1  上海  2000
# 2  广州  1500

# Add a column, method 1: assign a Series (values are aligned on the index).
df1['GDP'] = pd.Series([9999, 8888, 7777])
print(df1)
#    城市    人口   GDP
# 0  北京  1000  9999
# 1  上海  2000  8888
# 2  广州  1500  7777

# Add a column, method 2 (preferred): map an existing column through a dict,
# which does not depend on the index labels -- see df2 below for the pitfall
# of method 1.
salary = {'北京': 10, '上海': 20, '广州': 30}
df1['工资'] = df1['城市'].map(salary)
print(df1)
#    城市    人口   GDP  工资
# 0  北京  1000  9999  10
# 1  上海  2000  8888  20
# 2  广州  1500  7777  30

df2 = pd.DataFrame({'城市': ['北京', '上海', '广州'], '人口': [1000, 2000, 1500]},
                   index=['A', 'B', 'C'])
print(df2)
#    城市    人口
# A  北京  1000
# B  上海  2000
# C  广州  1500

# The assigned Series has index 0, 1, 2, none of which match 'A', 'B', 'C',
# so the whole column comes out as NaN.
df2['GDP'] = pd.Series([9999, 8888, 7777])
print(df2)
#    城市    人口  GDP
# A  北京  1000  NaN
# B  上海  2000  NaN
# C  广州  1500  NaN

# Giving the Series the matching index makes method 1 work again.
df2['GGDDPP'] = pd.Series([9999, 8888, 7777], index=['A', 'B', 'C'])
print(df2)
#    城市    人口  GDP  GGDDPP
# A  北京  1000  NaN    9999
# B  上海  2000  NaN    8888
# C  广州  1500  NaN    7777

# --- Replace ---
# Series.replace with a scalar, a dict and parallel lists.
# Commented tables are the output recorded by the original author.

s1 = pd.Series(np.arange(100, 105))
print(s1)
# 0    100
# 1    101
# 2    102
# 3    103
# 4    104
# dtype: int32   (as recorded by the author; the integer width is platform dependent)

# Replace one value with another (the NaN turns the result into floats).
print(s1.replace(101, np.nan))
# 0    100.0
# 1      NaN
# 2    102.0
# 3    103.0
# 4    104.0
# dtype: float64

# The same replacement spelled as an {old: new} dict.
print(s1.replace({101: np.nan}))
# 0    100.0
# 1      NaN
# 2    102.0
# 3    103.0
# 4    104.0
# dtype: float64

# Replace several values at once: old values and new values are paired by position.
print(s1.replace([100, 103, 104], ['中', 'eng', 's']))
# 0      中
# 1    101
# 2    102
# 3    eng
# 4      s
# dtype: object

# replace returned new objects each time; s1 itself is unchanged.
print(s1)
# 0    100
# 1    101
# 2    102
# 3    103
# 4    104
# dtype: int32

 

  

二、pandas入门的更多相关文章

  1. 利用Python进行数据分析——pandas入门

    利用Python进行数据分析--pandas入门 基于NumPy建立的 from pandas importSeries,DataFrame,import pandas as pd 一.两种数据结构 ...

  2. 利用python进行数据分析之pandas入门

    转自https://zhuanlan.zhihu.com/p/26100976 目录: 5.1 pandas 的数据结构介绍5.1.1 Series5.1.2 DataFrame5.1.3索引对象5. ...

  3. < 利用Python进行数据分析 - 第2版 > 第五章 pandas入门 读书笔记

    <利用Python进行数据分析·第2版>第五章 pandas入门--基础对象.操作.规则 python引用.浅拷贝.深拷贝 / 视图.副本 视图=引用 副本=浅拷贝/深拷贝 浅拷贝/深拷贝 ...

  4. XML学习总结(二)——XML入门

    XML学习总结(二)——XML入门 一.XML语法学习 学习XML语法的目的就是编写XML 一个XML文件分为如下几部分内容: 文档声明 元素 属性 注释 CDATA区 .特殊字符 处理指令(proc ...

  5. Spring+SpringMVC+MyBatis深入学习及搭建(十二)——SpringMVC入门程序(一)

    转载请注明出处:http://www.cnblogs.com/Joanna-Yan/p/6999743.html 前面讲到:Spring+SpringMVC+MyBatis深入学习及搭建(十一)——S ...

  6. 基于tensorflow的MNIST手写数字识别(二)--入门篇

    http://www.jianshu.com/p/4195577585e6 基于tensorflow的MNIST手写字识别(一)--白话卷积神经网络模型 基于tensorflow的MNIST手写数字识 ...

  7. 转:JAVAWEB开发之权限管理(二)——shiro入门详解以及使用方法、shiro认证与shiro授权

    原文地址:JAVAWEB开发之权限管理(二)——shiro入门详解以及使用方法.shiro认证与shiro授权 以下是部分内容,具体见原文. shiro介绍 什么是shiro shiro是Apache ...

  8. Python 数据处理库 pandas 入门教程

    Python 数据处理库 pandas 入门教程2018/04/17 · 工具与框架 · Pandas, Python 原文出处: 强波的技术博客 pandas是一个Python语言的软件包,在我们使 ...

  9. 深入浅出 JMS(二) - ActiveMQ 入门指南

    深入浅出 JMS(二) - ActiveMQ 入门指南 上篇博文深入浅出 JMS(一) – JMS 基本概念,我们介绍了消息通信的规范JMS,这篇博文介绍一款开源的 JMS 具体实现-- Active ...

  10. 利用python进行数据分析--pandas入门2

    随书练习,第五章  pandas入门2 # coding: utf-8 # In[1]: from pandas import Series,DataFrame import pandas as pd ...

随机推荐

  1. CodeForces 623B【预处理+DP】

    题意: 给出n,a,b以及n个整数a1,a2-an, 可以对数组进行以下两种操作: (1)花费len*a的代价删除连续的len个数,len<|S| (2)花费b的代价将某一个a[i]加一或减一, ...

  2. Unity3D中调用外接摄像头,并保存为图片文件

    http://bbs.9ria.com/thread-170539-1-1.html 项目要求调用摄像头,并且把图像保存下来,上传到服务器. 这里有几个难点,调用摄像头是很简单的,unity已经提供好 ...

  3. Android 跨应用调用Activity

    http://blog.csdn.net/ouyangliping/article/details/7972141 如何调用另外一个app应用的activity或者service,本文提供一个验证可行 ...

  4. (转)关于MongoDB你需要知道的几件事

    本文列举了颇让作者困惑的一些MongoDB限制,如果你也打算使用MongoDB,那么至少要提前了解这些限制,以免遇到的时候措手不及. 消耗磁盘空间 这是我的第一个困惑:MongoDB会消耗太多的磁盘空 ...

  5. [題解]luogu_P1144最短路計數

    1.無權圖最短路邊權為1 2.如果兩個點恰好不能被更新(d[y]==d[x]+1)那麼就能通過x的所有最短路到達y,所以ans[y]+=ans[x] 3.如果兩個點不能恰好被更新(d[y]>d[ ...

  6. git 版本控制(一)

    新建代码库repository 1.在当前目录新建一个git代码库 git init git init projectname 2.下载一个项目,如果已经有了远端的代码,则可以使用clone下载 gi ...

  7. light OJ 1282 - Leading and Trailing 数学 || double技巧

    http://lightoj.com/volume_showproblem.php?problem=1282 #include <cstdio> #include <cstdlib& ...

  8. arcengine geometry union操作

    以前得到的结果老是某一个,用下面的方法就可以获取合并后的结果 IGeometry pUnionGeo = null; var bFirst = true; foreach (IGeometry pGe ...

  9. IP Addresses of Google Global Cache

    Bulgaria 93.123.23.1 93.123.23.2 93.123.23.3 93.123.23.4 93.123.23.5 93.123.23.6 93.123.23.7 93.123. ...

  10. T4310 祖玛游戏

    题目描述 祖玛是一款曾经风靡全球的游戏,其玩法是:在一条轨道上初始排列着若干 个彩色珠子,其中任意三个相邻的珠子不会完全同色.此后,你可以发射珠子到 轨道上并加入原有序列中.一旦有三个或更多同色的珠子 ...