Pandas: A Concise, Example-Driven Introduction
import pandas as pd
import numpy as np
0. Motivating Example
# An ndarray generated directly with NumPy
stock_change = np.random.normal(0, 1, (10, 8))
stock_change
array([[ 0.74057955, 0.78604657, -0.15264135, 0.05680483, 0.09388135,
0.7313751 , -1.52338443, 1.71156505],
[ 0.42204925, 0.62541715, -1.41583042, -0.27434654, 0.98587136,
-0.55797884, 0.31026482, -0.47964535],
[ 0.99741102, -0.94397298, -0.40782973, -1.33631227, -0.0124836 ,
1.1873408 , -0.25430393, -0.74264106],
[ 0.34156662, -0.40621262, 0.82861416, 0.1272128 , 1.04101412,
0.79061324, -0.60325544, 1.29954581],
[-1.23289547, 0.83789748, 1.19276989, 0.45092868, -1.7418129 ,
-0.65362211, -0.17752493, 1.87679286],
[-0.4268705 , 1.14017572, 0.18261009, -0.28947877, 0.82489897,
0.11566058, -0.53191371, -0.96065812],
[ 0.92792797, 0.26086313, 0.08316582, -0.94533007, -0.77956139,
0.23006703, -0.81971461, -1.36742474],
[ 0.82241768, 0.54201367, -0.19331564, 0.50576697, -0.42545839,
-0.24247517, -0.03526651, -0.02268451],
[ 1.67480093, -1.23265948, -2.88199942, -1.07761987, -1.37844497,
-0.13581683, 2.06013919, 1.18986057],
[ 0.60744357, 0.52348326, 0.76418263, -0.73385554, 0.54857341,
0.27310645, -0.26464179, 0.77370496]])
# Wrap the ndarray in a DataFrame via pd.DataFrame(ndarray)
stock_df = pd.DataFrame(stock_change)
stock_df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|
0 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
1 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
2 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
3 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
4 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
5 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
6 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
7 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
8 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
9 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
stock_df.shape
(10, 8)
# Add a row index
stock_name = ['股票{}'.format(i+1) for i in range(stock_df.shape[0])]
stock_df = pd.DataFrame(data=stock_change, index=stock_name)
stock_df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|
股票1 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
股票2 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
股票3 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
股票4 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
股票5 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
股票6 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
股票7 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
股票8 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
股票9 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
股票10 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
# Add a column index
# pd.date_range(): start - start date, end - end date, periods - number of periods, freq - frequency ('B': business day, 'M': month, 'D': calendar day)
date = pd.date_range(start='2020-3-30', periods=stock_df.shape[1], freq='d')
date
DatetimeIndex(['2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02',
'2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06'],
dtype='datetime64[ns]', freq='D')
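As an aside, the other freq codes mentioned above work the same way; the short sketch below is an added illustration (not part of the original notebook) using business-day frequency.
# freq='B' skips weekends, so five periods starting from Monday 2020-03-30 are the weekdays of that week
pd.date_range(start='2020-3-30', periods=5, freq='B')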
stock_df = pd.DataFrame(stock_change, index=stock_name, columns=date)
stock_df
2020-03-30 | 2020-03-31 | 2020-04-01 | 2020-04-02 | 2020-04-03 | 2020-04-04 | 2020-04-05 | 2020-04-06 | |
---|---|---|---|---|---|---|---|---|
股票1 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
股票2 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
股票3 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
股票4 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
股票5 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
股票6 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
股票7 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
股票8 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
股票9 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
股票10 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
1. Pandas Core Data Structures
1.1 DataFrame
stock_df
2020-03-30 | 2020-03-31 | 2020-04-01 | 2020-04-02 | 2020-04-03 | 2020-04-04 | 2020-04-05 | 2020-04-06 | |
---|---|---|---|---|---|---|---|---|
股票1 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
股票2 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
股票3 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
股票4 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
股票5 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
股票6 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
股票7 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
股票8 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
股票9 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
股票10 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
# Check the DataFrame's shape; it behaves like a 2-D array
stock_df.shape
(10, 8)
# Get the row index
stock_df.index
Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')
# Get the column index
stock_df.columns
DatetimeIndex(['2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02',
'2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06'],
dtype='datetime64[ns]', freq='D')
# Get the underlying ndarray of values
stock_df.values
array([[ 0.74057955, 0.78604657, -0.15264135, 0.05680483, 0.09388135,
0.7313751 , -1.52338443, 1.71156505],
[ 0.42204925, 0.62541715, -1.41583042, -0.27434654, 0.98587136,
-0.55797884, 0.31026482, -0.47964535],
[ 0.99741102, -0.94397298, -0.40782973, -1.33631227, -0.0124836 ,
1.1873408 , -0.25430393, -0.74264106],
[ 0.34156662, -0.40621262, 0.82861416, 0.1272128 , 1.04101412,
0.79061324, -0.60325544, 1.29954581],
[-1.23289547, 0.83789748, 1.19276989, 0.45092868, -1.7418129 ,
-0.65362211, -0.17752493, 1.87679286],
[-0.4268705 , 1.14017572, 0.18261009, -0.28947877, 0.82489897,
0.11566058, -0.53191371, -0.96065812],
[ 0.92792797, 0.26086313, 0.08316582, -0.94533007, -0.77956139,
0.23006703, -0.81971461, -1.36742474],
[ 0.82241768, 0.54201367, -0.19331564, 0.50576697, -0.42545839,
-0.24247517, -0.03526651, -0.02268451],
[ 1.67480093, -1.23265948, -2.88199942, -1.07761987, -1.37844497,
-0.13581683, 2.06013919, 1.18986057],
[ 0.60744357, 0.52348326, 0.76418263, -0.73385554, 0.54857341,
0.27310645, -0.26464179, 0.77370496]])
# Transpose
stock_df.T
股票1 | 股票2 | 股票3 | 股票4 | 股票5 | 股票6 | 股票7 | 股票8 | 股票9 | 股票10 | |
---|---|---|---|---|---|---|---|---|---|---|
2020-03-30 | 0.740580 | 0.422049 | 0.997411 | 0.341567 | -1.232895 | -0.426871 | 0.927928 | 0.822418 | 1.674801 | 0.607444 |
2020-03-31 | 0.786047 | 0.625417 | -0.943973 | -0.406213 | 0.837897 | 1.140176 | 0.260863 | 0.542014 | -1.232659 | 0.523483 |
2020-04-01 | -0.152641 | -1.415830 | -0.407830 | 0.828614 | 1.192770 | 0.182610 | 0.083166 | -0.193316 | -2.881999 | 0.764183 |
2020-04-02 | 0.056805 | -0.274347 | -1.336312 | 0.127213 | 0.450929 | -0.289479 | -0.945330 | 0.505767 | -1.077620 | -0.733856 |
2020-04-03 | 0.093881 | 0.985871 | -0.012484 | 1.041014 | -1.741813 | 0.824899 | -0.779561 | -0.425458 | -1.378445 | 0.548573 |
2020-04-04 | 0.731375 | -0.557979 | 1.187341 | 0.790613 | -0.653622 | 0.115661 | 0.230067 | -0.242475 | -0.135817 | 0.273106 |
2020-04-05 | -1.523384 | 0.310265 | -0.254304 | -0.603255 | -0.177525 | -0.531914 | -0.819715 | -0.035267 | 2.060139 | -0.264642 |
2020-04-06 | 1.711565 | -0.479645 | -0.742641 | 1.299546 | 1.876793 | -0.960658 | -1.367425 | -0.022685 | 1.189861 | 0.773705 |
# View the first few rows (5 by default)
stock_df.head(5)
# View the last few rows
stock_df.tail()
2020-03-30 | 2020-03-31 | 2020-04-01 | 2020-04-02 | 2020-04-03 | 2020-04-04 | 2020-04-05 | 2020-04-06 | |
---|---|---|---|---|---|---|---|---|
股票6 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
股票7 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
股票8 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
股票9 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
股票10 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
1.1.1 Setting the Index
# The index can only be changed by assigning a complete new index (a whole set of labels), not element by element
data_index = [['股票__{}'.format(i+1) for i in range(stock_df.shape[0])]]
stock_df.index = data_index
stock_df
2020-03-30 | 2020-03-31 | 2020-04-01 | 2020-04-02 | 2020-04-03 | 2020-04-04 | 2020-04-05 | 2020-04-06 | |
---|---|---|---|---|---|---|---|---|
股票__1 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
股票__2 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
股票__3 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
股票__4 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
股票__5 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
股票__6 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
股票__7 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
股票__8 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
股票__9 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
股票__10 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
# Assigning a single index label raises an error; the whole index must be replaced
# stock_df.index[3] = 'hahha'
# stock_df
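If you only need to change a few labels rather than rebuild the whole index, DataFrame.rename is one option; the line below is a minimal added sketch, assuming the 股票__ labels assigned above.
# rename maps old labels to new ones and returns a new DataFrame
stock_df.rename(index={'股票__3': '股票__03'})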
1.1.2 Resetting the Index
# reset_index moves the current index into a regular column and adds a fresh integer index
# drop=False (default) - keep the old index as a column
# drop=True - discard the old index
stock_df.reset_index()
level_0 | 2020-03-30 00:00:00 | 2020-03-31 00:00:00 | 2020-04-01 00:00:00 | 2020-04-02 00:00:00 | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | |
---|---|---|---|---|---|---|---|---|---|
0 | 股票__1 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
1 | 股票__2 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
2 | 股票__3 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
3 | 股票__4 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
4 | 股票__5 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
5 | 股票__6 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
6 | 股票__7 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
7 | 股票__8 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
8 | 股票__9 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
9 | 股票__10 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
stock_df.reset_index(drop=True)
2020-03-30 | 2020-03-31 | 2020-04-01 | 2020-04-02 | 2020-04-03 | 2020-04-04 | 2020-04-05 | 2020-04-06 | |
---|---|---|---|---|---|---|---|---|
0 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
1 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
2 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
3 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
4 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
5 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
6 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
7 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
8 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
9 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
1.1.3 Using a Column as the Index
stock_df.set_index(keys='2020-03-30', drop=False) # drop=False keeps the '2020-03-30' column as data; the old row index is still replaced (use append=True to keep it as an extra level)
2020-03-30 | 2020-03-31 | 2020-04-01 | 2020-04-02 | 2020-04-03 | 2020-04-04 | 2020-04-05 | 2020-04-06 | |
---|---|---|---|---|---|---|---|---|
2020-03-30 | ||||||||
0.740580 | 0.740580 | 0.786047 | -0.152641 | 0.056805 | 0.093881 | 0.731375 | -1.523384 | 1.711565 |
0.422049 | 0.422049 | 0.625417 | -1.415830 | -0.274347 | 0.985871 | -0.557979 | 0.310265 | -0.479645 |
0.997411 | 0.997411 | -0.943973 | -0.407830 | -1.336312 | -0.012484 | 1.187341 | -0.254304 | -0.742641 |
0.341567 | 0.341567 | -0.406213 | 0.828614 | 0.127213 | 1.041014 | 0.790613 | -0.603255 | 1.299546 |
-1.232895 | -1.232895 | 0.837897 | 1.192770 | 0.450929 | -1.741813 | -0.653622 | -0.177525 | 1.876793 |
-0.426871 | -0.426871 | 1.140176 | 0.182610 | -0.289479 | 0.824899 | 0.115661 | -0.531914 | -0.960658 |
0.927928 | 0.927928 | 0.260863 | 0.083166 | -0.945330 | -0.779561 | 0.230067 | -0.819715 | -1.367425 |
0.822418 | 0.822418 | 0.542014 | -0.193316 | 0.505767 | -0.425458 | -0.242475 | -0.035267 | -0.022685 |
1.674801 | 1.674801 | -1.232659 | -2.881999 | -1.077620 | -1.378445 | -0.135817 | 2.060139 | 1.189861 |
0.607444 | 0.607444 | 0.523483 | 0.764183 | -0.733856 | 0.548573 | 0.273106 | -0.264642 | 0.773705 |
# Create a DataFrame from a dict
df = pd.DataFrame({'month': [1, 4, 7, 10],
'year': [2012, 2014, 2013, 2014],
'sale':[55, 40, 84, 31]})
df
month | year | sale | |
---|---|---|---|
0 | 1 | 2012 | 55 |
1 | 4 | 2014 | 40 |
2 | 7 | 2013 | 84 |
3 | 10 | 2014 | 31 |
df.set_index(keys='month')
year | sale | |
---|---|---|
month | ||
1 | 2012 | 55 |
4 | 2014 | 40 |
7 | 2013 | 84 |
10 | 2014 | 31 |
# Setting two index columns produces a MultiIndex (a way of representing higher-dimensional data in two dimensions)
# df.set_index(keys=['month', 'year'])
df.set_index(['month', 'year'])
sale | ||
---|---|---|
month | year | |
1 | 2012 | 55 |
4 | 2014 | 40 |
7 | 2013 | 84 |
10 | 2014 | 31 |
1.2 MultiIndex
df_m = df.set_index(['year', 'month'])
df_m
sale | ||
---|---|---|
year | month | |
2012 | 1 | 55 |
2014 | 4 | 40 |
2013 | 7 | 84 |
2014 | 10 | 31 |
# Index attributes
# - names: the name of each level
# - levels: the list of values in each level
df_m.index
MultiIndex([(2012, 1),
(2014, 4),
(2013, 7),
(2014, 10)],
names=['year', 'month'])
df_m.index.names
FrozenList(['year', 'month'])
df_m.index.levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])
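Rows of a MultiIndexed frame can be selected by passing level values to loc; the two lines below are an added sketch assuming the df_m built above.
# Full key (year, month) -> the single matching row
df_m.loc[(2014, 4)]
# Partial key on the first level only -> all rows for that year
df_m.loc[2014]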
1.3 Series
# A default integer index starting at 0 is generated automatically
# Data must be 1-dimensional
pd.Series(np.arange(10))
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
# Specify the index explicitly
pd.Series([6.7,5.6,3,10,2], index=['a', 'b', 'c', 'd', 'e'])
a 6.7
b 5.6
c 3.0
d 10.0
e 2.0
dtype: float64
# Create a Series from a dict
se = pd.Series({'red':100, 'blue':200, 'green': 500, 'yellow':1000})
se
red 100
blue 200
green 500
yellow 1000
dtype: int64
# Get the index
se.index
Index(['red', 'blue', 'green', 'yellow'], dtype='object')
# Get the underlying array of values
se.values
array([ 100, 200, 500, 1000], dtype=int64)
pd.Series(np.random.normal(0, 1, (10)))
0 -0.975747
1 0.021589
2 -0.384579
3 -0.412900
4 0.218133
5 -0.866525
6 -0.777209
7 -1.032130
8 0.202134
9 0.295274
dtype: float64
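A Series supports the same label and positional access as a DataFrame; the following added sketch uses the se Series created above.
se['green']      # access by label -> 500
se.iloc[1]       # access by integer position -> 200
se / se.sum()    # arithmetic is vectorized and keeps the index aligned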
2. Basic Data Operations
2.1 Indexing
# Read local data with pd.read_csv()
data = pd.read_csv('./data/stock_day.csv')
data
open | high | close | low | volume | price_change | p_change | ma5 | ma10 | ma20 | v_ma5 | v_ma10 | v_ma20 | turnover | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 22.942 | 22.142 | 22.875 | 53782.64 | 46738.65 | 55576.11 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 22.406 | 21.955 | 22.942 | 40827.52 | 42736.34 | 56007.50 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 21.938 | 21.929 | 23.022 | 35119.58 | 41871.97 | 56372.85 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 21.446 | 21.909 | 23.137 | 35397.58 | 39904.78 | 60149.60 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 21.366 | 21.923 | 23.253 | 33590.21 | 42935.74 | 61716.11 | 0.58 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 14.28 | 13.13 | 179831.72 | 1.12 | 8.51 | 13.112 | 13.112 | 13.112 | 115090.18 | 115090.18 | 115090.18 | 6.16 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 12.87 | 93180.39 | 0.26 | 2.02 | 12.820 | 12.820 | 12.820 | 98904.79 | 98904.79 | 98904.79 | 3.19 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 12.61 | 67075.44 | 0.20 | 1.57 | 12.707 | 12.707 | 12.707 | 100812.93 | 100812.93 | 100812.93 | 2.30 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 12.52 | 139071.61 | 0.18 | 1.44 | 12.610 | 12.610 | 12.610 | 117681.67 | 117681.67 | 117681.67 | 4.76 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 12.20 | 96291.73 | 0.32 | 2.62 | 12.520 | 12.520 | 12.520 | 96291.73 | 96291.73 | 96291.73 | 3.30 |
643 rows × 14 columns
# Drop a few columns to simplify the data
data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1) # axis=1 drops columns; axis=0 (the default) drops rows
data
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 0.58 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 14.28 | 13.13 | 179831.72 | 1.12 | 8.51 | 6.16 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 12.87 | 93180.39 | 0.26 | 2.02 | 3.19 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 12.61 | 67075.44 | 0.20 | 1.57 | 2.30 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 12.52 | 139071.61 | 0.18 | 1.44 | 4.76 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 12.20 | 96291.73 | 0.32 | 2.62 | 3.30 |
643 rows × 8 columns
2.1.1 Direct Bracket Indexing
# With plain brackets the column must come first, then the row
data['high']['2018-02-27']
25.88
# data['2018-02-27']['high']  # row label first would raise a KeyError
2.1.2 Indexing with loc and iloc
# loc uses labels, row first and then column
data.loc['2018-02-27']['high']
25.88
# Either form works
data.loc['2018-02-27','high']
25.88
data.loc['2018-02-27':'2018-02-22', 'open']
2018-02-27 23.53
2018-02-26 22.80
2018-02-23 22.88
2018-02-22 22.25
Name: open, dtype: float64
# data.loc['high']['2018-02-27']
# iloc uses integer positions, row first and then column
data.iloc[:3, 3:5]
low | volume | |
---|---|---|
2018-02-27 | 23.53 | 95578.03 |
2018-02-26 | 22.80 | 60985.11 |
2018-02-23 | 22.71 | 52914.01 |
2.1.3 Mixed Indexing (the removed ix)
# ix accepted a mix of integer positions and labels, row first and then column
# It has been removed from current pandas versions
# data.ix[0:4, ['open', 'close', 'high', 'low']]
# Instead, take the row labels from data.index and slice them
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
open | close | high | low | |
---|---|---|---|---|
2018-02-27 | 23.53 | 24.16 | 25.88 | 23.53 |
2018-02-26 | 22.80 | 23.53 | 23.78 | 22.80 |
2018-02-23 | 22.88 | 22.82 | 23.37 | 22.71 |
2018-02-22 | 22.25 | 22.28 | 22.76 | 22.02 |
data.index
Index(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22', '2018-02-14',
'2018-02-13', '2018-02-12', '2018-02-09', '2018-02-08', '2018-02-07',
...
'2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10', '2015-03-09',
'2015-03-06', '2015-03-05', '2015-03-04', '2015-03-03', '2015-03-02'],
dtype='object', length=643)
data.iloc[0:4, data.columns.get_indexer(['open', 'close', 'high', 'low'])]
open | close | high | low | |
---|---|---|---|---|
2018-02-27 | 23.53 | 24.16 | 25.88 | 23.53 |
2018-02-26 | 22.80 | 23.53 | 23.78 | 22.80 |
2018-02-23 | 22.88 | 22.82 | 23.37 | 22.71 |
2018-02-22 | 22.25 | 22.28 | 22.76 | 22.02 |
data.columns.get_indexer(['close'])
array([2], dtype=int64)
data
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 0.58 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 14.28 | 13.13 | 179831.72 | 1.12 | 8.51 | 6.16 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 12.87 | 93180.39 | 0.26 | 2.02 | 3.19 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 12.61 | 67075.44 | 0.20 | 1.57 | 2.30 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 12.52 | 139071.61 | 0.18 | 1.44 | 4.76 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 12.20 | 96291.73 | 0.32 | 2.62 | 3.30 |
643 rows × 8 columns
2.2 Assignment
data
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 0.58 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 14.28 | 13.13 | 179831.72 | 1.12 | 8.51 | 6.16 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 12.87 | 93180.39 | 0.26 | 2.02 | 3.19 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 12.61 | 67075.44 | 0.20 | 1.57 | 2.30 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 12.52 | 139071.61 | 0.18 | 1.44 | 4.76 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 12.20 | 96291.73 | 0.32 | 2.62 | 3.30 |
643 rows × 8 columns
# Assignment, option 1: attribute access (works for existing columns)
data.volume = 100
data.head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 100 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 100 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 100 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 100 | 0.44 | 2.05 | 0.58 |
# Assignment, option 2: bracket access, like slicing
data['low'] = 100
data.head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 100 | 100 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 100 | 100 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 100 | 100 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 100 | 100 | 0.44 | 2.05 | 0.58 |
# Pull a column out as a Series (attribute access)
data.open.head()
2018-02-27 23.53
2018-02-26 22.80
2018-02-23 22.88
2018-02-22 22.25
2018-02-14 21.49
Name: open, dtype: float64
# Pull a column out as a Series (bracket access)
data['open'].head()
2018-02-27 23.53
2018-02-26 22.80
2018-02-23 22.88
2018-02-22 22.25
2018-02-14 21.49
Name: open, dtype: float64
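To overwrite only part of a column, select the rows and the column in a single loc call; this added sketch works on a copy (tmp is a hypothetical name) so the original data keeps its values.
# Work on a copy; one combined loc call avoids chained assignment
tmp = data.copy()
tmp.loc[tmp.index[:3], 'turnover'] = 0   # overwrite only the first three rows of one column
tmp.head()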
2.3 Sorting
data.head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 100 | 100 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 100 | 100 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 100 | 100 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 100 | 100 | 0.44 | 2.05 | 0.58 |
2.3.1 Sorting by Values
# by --> one column name or a list of names; earlier names take priority; ascending by default
data.sort_values(by='open', ascending=False)
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2015-06-15 | 34.99 | 34.99 | 31.69 | 100 | 100 | -3.52 | -10.00 | 6.82 |
2015-06-12 | 34.69 | 35.98 | 35.21 | 100 | 100 | 0.82 | 2.38 | 5.47 |
2015-06-10 | 34.10 | 36.35 | 33.85 | 100 | 100 | 0.51 | 1.53 | 9.21 |
2017-11-01 | 33.85 | 34.34 | 33.83 | 100 | 100 | -0.61 | -1.77 | 5.81 |
2015-06-11 | 33.17 | 34.98 | 34.39 | 100 | 100 | 0.54 | 1.59 | 5.92 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-05 | 12.88 | 13.45 | 13.16 | 100 | 100 | 0.26 | 2.02 | 3.19 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 100 | 100 | 0.20 | 1.57 | 2.30 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 100 | 100 | 0.18 | 1.44 | 4.76 |
2015-09-02 | 12.30 | 14.11 | 12.36 | 100 | 100 | -1.10 | -8.17 | 2.40 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 100 | 100 | 0.32 | 2.62 | 3.30 |
643 rows × 8 columns
data.sort_values(by=['open', 'high'], ascending=True)
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2015-03-02 | 12.25 | 12.67 | 12.52 | 100 | 100 | 0.32 | 2.62 | 3.30 |
2015-09-02 | 12.30 | 14.11 | 12.36 | 100 | 100 | -1.10 | -8.17 | 2.40 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 100 | 100 | 0.18 | 1.44 | 4.76 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 100 | 100 | 0.20 | 1.57 | 2.30 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 100 | 100 | 0.26 | 2.02 | 3.19 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-06-11 | 33.17 | 34.98 | 34.39 | 100 | 100 | 0.54 | 1.59 | 5.92 |
2017-11-01 | 33.85 | 34.34 | 33.83 | 100 | 100 | -0.61 | -1.77 | 5.81 |
2015-06-10 | 34.10 | 36.35 | 33.85 | 100 | 100 | 0.51 | 1.53 | 9.21 |
2015-06-12 | 34.69 | 35.98 | 35.21 | 100 | 100 | 0.82 | 2.38 | 5.47 |
2015-06-15 | 34.99 | 34.99 | 31.69 | 100 | 100 | -3.52 | -10.00 | 6.82 |
643 rows × 8 columns
# A Series holds a single set of values, so sort_values() needs no 'by' argument
data.close.sort_values()
2015-09-02 12.36
2015-03-02 12.52
2015-03-03 12.70
2015-09-07 12.77
2015-03-04 12.90
...
2017-11-01 33.83
2015-06-10 33.85
2015-06-11 34.39
2017-10-31 34.44
2015-06-12 35.21
Name: close, Length: 643, dtype: float64
2.3.2 Sorting by Index
# Sort a DataFrame by its index with sort_index
data.sort_index()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2015-03-02 | 12.25 | 12.67 | 12.52 | 100 | 100 | 0.32 | 2.62 | 3.30 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 100 | 100 | 0.18 | 1.44 | 4.76 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 100 | 100 | 0.20 | 1.57 | 2.30 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 100 | 100 | 0.26 | 2.02 | 3.19 |
2015-03-06 | 13.17 | 14.48 | 14.28 | 100 | 100 | 1.12 | 8.51 | 6.16 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2018-02-14 | 21.49 | 21.99 | 21.92 | 100 | 100 | 0.44 | 2.05 | 0.58 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 100 | 100 | 0.36 | 1.64 | 0.90 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 100 | 100 | 0.54 | 2.42 | 1.32 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-27 | 23.53 | 25.88 | 24.16 | 100 | 100 | 0.63 | 2.68 | 2.39 |
643 rows × 8 columns
# Sorting a Series by index
data.high.sort_index()
2015-03-02 12.67
2015-03-03 13.06
2015-03-04 12.92
2015-03-05 13.45
2015-03-06 14.48
...
2018-02-14 21.99
2018-02-22 22.76
2018-02-23 23.37
2018-02-26 23.78
2018-02-27 25.88
Name: high, Length: 643, dtype: float64
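sort_values also accepts one ascending flag per column when several keys are given; a brief added sketch:
# 'open' descending, ties broken by 'high' ascending
data.sort_values(by=['open', 'high'], ascending=[False, True]).head()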
3. DataFrame Operations
3.1 Arithmetic
data.head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 100 | 100 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 100 | 100 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 100 | 100 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 100 | 100 | 0.44 | 2.05 | 0.58 |
# The pandas methods (add, sub, ...) are the recommended style
data['close'].add(100).head()
2018-02-27 124.16
2018-02-26 123.53
2018-02-23 122.82
2018-02-22 122.28
2018-02-14 121.92
Name: close, dtype: float64
# Operators work too
(data.close + 100).head()
2018-02-27 124.16
2018-02-26 123.53
2018-02-23 122.82
2018-02-22 122.28
2018-02-14 121.92
Name: close, dtype: float64
data.close.sub(10).head()
2018-02-27 14.16
2018-02-26 13.53
2018-02-23 12.82
2018-02-22 12.28
2018-02-14 11.92
Name: close, dtype: float64
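The same element-wise arithmetic works between two columns, which is handy for derived quantities; an added sketch using columns of data:
# Daily gap between the high and the closing price, as a new Series
(data['high'] - data['close']).head()
# Equivalent method form
data['high'].sub(data['close']).head()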
3.2 Logical Operations
3.2.1 Comparison and Boolean Operators (<, >, |, &)
data.head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 100 | 100 | 0.63 | 2.68 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 100 | 100 | 0.54 | 2.42 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 100 | 100 | 0.36 | 1.64 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 100 | 100 | 0.44 | 2.05 | 0.58 |
# data['open'] > 23 returns a boolean Series (True/False per row)
# data[boolean_series] then uses that result as a row filter
data['open'] > 23
2018-02-27 True
2018-02-26 False
2018-02-23 False
2018-02-22 False
2018-02-14 False
...
2015-03-06 False
2015-03-05 False
2015-03-04 False
2015-03-03 False
2015-03-02 False
Name: open, Length: 643, dtype: bool
data[data.open>23].head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 100 | 100 | 0.63 | 2.68 | 2.39 |
2018-02-01 | 23.71 | 23.86 | 22.42 | 100 | 100 | -1.30 | -5.48 | 1.66 |
2018-01-31 | 23.85 | 23.98 | 23.72 | 100 | 100 | -0.11 | -0.46 | 1.23 |
2018-01-30 | 23.71 | 24.08 | 23.83 | 100 | 100 | 0.05 | 0.21 | 0.81 |
2018-01-29 | 24.40 | 24.63 | 23.77 | 100 | 100 | -0.73 | -2.98 | 1.64 |
# Combine conditions with & (and) and | (or)
# Because of operator precedence, wrap each condition in parentheses
data[(data.close>23) & (data.close<24)].head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-05 | 22.45 | 23.39 | 23.27 | 100 | 100 | 0.65 | 2.87 | 1.31 |
2018-01-31 | 23.85 | 23.98 | 23.72 | 100 | 100 | -0.11 | -0.46 | 1.23 |
2018-01-30 | 23.71 | 24.08 | 23.83 | 100 | 100 | 0.05 | 0.21 | 0.81 |
2018-01-29 | 24.40 | 24.63 | 23.77 | 100 | 100 | -0.73 | -2.98 | 1.64 |
3.2.2 Logical Functions
# query() takes the condition as a string
data.query('close>23 & close<24').head()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-26 | 22.80 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-05 | 22.45 | 23.39 | 23.27 | 100 | 100 | 0.65 | 2.87 | 1.31 |
2018-01-31 | 23.85 | 23.98 | 23.72 | 100 | 100 | -0.11 | -0.46 | 1.23 |
2018-01-30 | 23.71 | 24.08 | 23.83 | 100 | 100 | 0.05 | 0.21 | 0.81 |
2018-01-29 | 24.40 | 24.63 | 23.77 | 100 | 100 | -0.73 | -2.98 | 1.64 |
# isin() takes an iterable of values and tests exact membership (it checks values, not a numeric range)
data['open'].isin([22.80, 23.00])
2018-02-27 False
2018-02-26 True
2018-02-23 False
2018-02-22 False
2018-02-14 False
...
2015-03-06 False
2015-03-05 False
2015-03-04 False
2015-03-03 False
2015-03-02 False
Name: open, Length: 643, dtype: bool
data[data['open'].isin([22.80, 23.00])]
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
2018-02-26 | 22.8 | 23.78 | 23.53 | 100 | 100 | 0.69 | 3.02 | 1.53 |
2018-02-06 | 22.8 | 23.55 | 22.29 | 100 | 100 | -0.97 | -4.17 | 1.39 |
2017-12-18 | 23.0 | 23.49 | 23.13 | 100 | 100 | 0.12 | 0.52 | 0.74 |
2017-07-24 | 22.8 | 23.79 | 23.03 | 100 | 100 | -0.17 | -0.73 | 2.59 |
2017-06-21 | 23.0 | 23.84 | 23.57 | 100 | 100 | -0.51 | -2.12 | 5.13 |
2016-01-04 | 22.8 | 22.84 | 20.69 | 100 | 100 | -2.28 | -9.93 | 1.60 |
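query can also reference local Python variables by prefixing them with @; a minimal added sketch (threshold is an illustrative name):
# Filter using a variable defined outside the query string
threshold = 2
data.query('p_change > @threshold').head()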
3.3 Statistics
3.3.1 describe()
# describe() gives a quick statistical summary of the whole DataFrame
# 25% - first quartile (Q1): the value below which 25% of the sorted sample falls
# 50% - median
data.describe()
open | high | close | low | volume | price_change | p_change | turnover | |
---|---|---|---|---|---|---|---|---|
count | 643.000000 | 643.000000 | 643.000000 | 643.0 | 643.0 | 643.000000 | 643.000000 | 643.000000 |
mean | 21.272706 | 21.900513 | 21.336267 | 100.0 | 100.0 | 0.018802 | 0.190280 | 2.936190 |
std | 3.930973 | 4.077578 | 3.942806 | 0.0 | 0.0 | 0.898476 | 4.079698 | 2.079375 |
min | 12.250000 | 12.670000 | 12.360000 | 100.0 | 100.0 | -3.520000 | -10.030000 | 0.040000 |
25% | 19.000000 | 19.500000 | 19.045000 | 100.0 | 100.0 | -0.390000 | -1.850000 | 1.360000 |
50% | 21.440000 | 21.970000 | 21.450000 | 100.0 | 100.0 | 0.050000 | 0.260000 | 2.500000 |
75% | 23.400000 | 24.065000 | 23.415000 | 100.0 | 100.0 | 0.455000 | 2.305000 | 3.915000 |
max | 34.990000 | 36.350000 | 35.210000 | 100.0 | 100.0 | 3.030000 | 10.030000 | 12.560000 |
3.3.2 Statistical Functions
# max(), min()
data.max()
open 34.99
high 36.35
close 35.21
low 100.00
volume 100.00
price_change 3.03
p_change 10.03
turnover 12.56
dtype: float64
data.std()
# data.var()
open 3.930973
high 4.077578
close 3.942806
low 0.000000
volume 0.000000
price_change 0.898476
p_change 4.079698
turnover 2.079375
dtype: float64
data.median()
open 21.44
high 21.97
close 21.45
low 100.00
volume 100.00
price_change 0.05
p_change 0.26
turnover 2.50
dtype: float64
# idxmax (index of max): the index label at which each column reaches its maximum
data.idxmax()
# data.idxmin()
open 2015-06-15
high 2015-06-10
close 2015-06-12
low 2018-02-27
volume 2018-02-27
price_change 2015-06-09
p_change 2015-08-28
turnover 2017-10-26
dtype: object
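Most of these reducers accept an axis argument; the added sketch below contrasts column-wise and row-wise means.
# axis=0 (default): one value per column
data.mean(axis=0)
# axis=1: one value per row, here averaged over three price columns
data[['open', 'high', 'close']].mean(axis=1).head()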
3.4 Cumulative Statistics
# Common cumulative functions:
# cumsum - cumulative sum
# cummax - running maximum (each new maximum replaces the previous one)
# cummin - running minimum
# cumprod - cumulative product
data = data.sort_index()
data.p_change
2015-03-02 2.62
2015-03-03 1.44
2015-03-04 1.57
2015-03-05 2.02
2015-03-06 8.51
...
2018-02-14 2.05
2018-02-22 1.64
2018-02-23 2.42
2018-02-26 3.02
2018-02-27 2.68
Name: p_change, Length: 643, dtype: float64
data.p_change.cumsum()
2015-03-02 2.62
2015-03-03 4.06
2015-03-04 5.63
2015-03-05 7.65
2015-03-06 16.16
...
2018-02-14 112.59
2018-02-22 114.23
2018-02-23 116.65
2018-02-26 119.67
2018-02-27 122.35
Name: p_change, Length: 643, dtype: float64
# Use pandas' built-in plotting (in a notebook the cell may need to be re-run, or %matplotlib inline enabled, before the figure appears)
data.p_change.cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1d54ecb2848>
data.p_change.cummax().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1d54ff73a88>
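Outside Jupyter the figure window does not open on its own; a minimal added sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt
data.p_change.cumsum().plot()   # draw onto the current axes
plt.show()                      # needed in a plain script for the window to appear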
3.5 Custom Functions with apply
# apply(func): func can be any function, often a lambda
data[['open']] # double brackets return a one-column DataFrame rather than a Series
open | |
---|---|
2015-03-02 | 12.25 |
2015-03-03 | 12.52 |
2015-03-04 | 12.80 |
2015-03-05 | 12.88 |
2015-03-06 | 13.17 |
... | ... |
2018-02-14 | 21.49 |
2018-02-22 | 22.25 |
2018-02-23 | 22.88 |
2018-02-26 | 22.80 |
2018-02-27 | 23.53 |
643 rows × 1 columns
data[['open']].apply(lambda x: x.max()-x.min()) # axis=0 by default, i.e. applied column by column
open 22.74
dtype: float64
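apply can also run a function row by row with axis=1; a short added sketch using columns of data:
# For each row, the gap between the high and the close
data.apply(lambda row: row['high'] - row['close'], axis=1).head()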
4. Built-in Plotting
# DataFrame.plot(x, y, kind='line')
# kind: plot type - line, bar, barh, hist, pie, scatter
# Works on both DataFrame and Series
data['open'].plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x1d54ffe6c88>
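For kind='scatter' the x and y columns must be named explicitly; a quick added sketch:
# Scatter plot of daily percentage change against the opening price
data.plot(x='open', y='p_change', kind='scatter')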
5. Reading and Writing Files
5.1 CSV Files
# usecols: read only the listed columns
# sep=',': field separator
# Read the file
data = pd.read_csv('./data/stock_day.csv', usecols=['open', 'high', 'low'], sep=',')
data
open | high | low | |
---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 23.53 |
2018-02-26 | 22.80 | 23.78 | 22.80 |
2018-02-23 | 22.88 | 23.37 | 22.71 |
2018-02-22 | 22.25 | 22.76 | 22.02 |
2018-02-14 | 21.49 | 21.99 | 21.48 |
... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 13.13 |
2015-03-05 | 12.88 | 13.45 | 12.87 |
2015-03-04 | 12.80 | 12.92 | 12.61 |
2015-03-03 | 12.52 | 13.06 | 12.52 |
2015-03-02 | 12.25 | 12.67 | 12.20 |
643 rows × 3 columns
# Write to a file
# columns: write only the specified columns
# index: whether to write the row index
data[:10].to_csv('./data/test_write_in.csv',columns=['high', 'low'], index=False)
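If the unnamed first column (the dates) should become the row index, read_csv can do that in one step; the added sketch below uses a separate variable name (stock, purely illustrative) so data is left untouched.
# index_col=0 turns the first CSV column into the index
stock = pd.read_csv('./data/stock_day.csv', index_col=0)
stock.head()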
5.2 HDF5 Files
# The HDF5 format is widely recommended: reading and writing are fast
# It is compressed, which saves space
# It is cross-platform
day_eps = pd.read_hdf('./data/stock_data/day/day_close.h5')
# The tables (PyTables) package must be installed for this to work
# .h5 files are binary and cannot be viewed directly; load them to inspect the contents
day_eps
000001.SZ | 000002.SZ | 000004.SZ | 000005.SZ | 000006.SZ | 000007.SZ | 000008.SZ | 000009.SZ | 000010.SZ | 000011.SZ | ... | 001965.SZ | 603283.SH | 002920.SZ | 002921.SZ | 300684.SZ | 002922.SZ | 300735.SZ | 603329.SH | 603655.SH | 603080.SH | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.30 | 17.71 | 4.58 | 2.88 | 14.60 | 2.62 | 4.96 | 4.66 | 5.37 | 6.02 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 17.02 | 19.20 | 4.65 | 3.02 | 15.97 | 2.65 | 4.95 | 4.70 | 5.37 | 6.27 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 17.02 | 17.28 | 4.56 | 3.06 | 14.37 | 2.63 | 4.82 | 4.47 | 5.37 | 5.96 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 16.18 | 16.97 | 4.49 | 2.95 | 13.10 | 2.73 | 4.89 | 4.33 | 5.37 | 5.77 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 16.95 | 17.19 | 4.55 | 2.99 | 13.18 | 2.77 | 4.97 | 4.42 | 5.37 | 5.92 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2673 | 12.96 | 35.99 | 22.84 | 4.37 | 9.85 | 16.66 | 8.47 | 7.52 | 6.20 | 17.88 | ... | 12.99 | 23.42 | 47.99 | 32.40 | 22.45 | 28.79 | 23.18 | 24.45 | 14.98 | 26.06 |
2674 | 13.08 | 35.84 | 23.02 | 4.41 | 9.85 | 16.66 | 8.49 | 7.48 | 6.01 | 17.75 | ... | 12.83 | 25.76 | 45.14 | 35.64 | 24.70 | 31.67 | 25.50 | 26.90 | 16.48 | 28.67 |
2675 | 13.47 | 35.67 | 22.40 | 4.32 | 9.85 | 16.66 | 8.49 | 7.38 | 5.97 | 17.45 | ... | 12.20 | 28.34 | 43.21 | 39.20 | 27.17 | 34.84 | 28.05 | 29.59 | 18.13 | 31.54 |
2676 | 13.40 | 35.15 | 22.29 | 4.29 | 9.85 | 16.66 | 8.56 | 7.04 | 5.84 | 17.49 | ... | 12.11 | 31.17 | 43.76 | 40.88 | 29.89 | 34.84 | 29.64 | 32.55 | 19.94 | 34.69 |
2677 | 13.55 | 35.55 | 22.20 | 4.37 | 9.85 | 16.66 | 8.67 | 7.06 | 5.99 | 17.76 | ... | 11.91 | 34.29 | 41.71 | 39.10 | 32.88 | 34.84 | 27.92 | 31.82 | 21.93 | 38.16 |
2678 rows × 3562 columns
# Save in .h5 format (a key must be given; to_hdf writes to disk and returns nothing)
day_eps.to_hdf('./data/day_eps_test.h5', key='day_eps')
pd.read_hdf('./data/day_eps_test.h5')
000001.SZ | 000002.SZ | 000004.SZ | 000005.SZ | 000006.SZ | 000007.SZ | 000008.SZ | 000009.SZ | 000010.SZ | 000011.SZ | ... | 001965.SZ | 603283.SH | 002920.SZ | 002921.SZ | 300684.SZ | 002922.SZ | 300735.SZ | 603329.SH | 603655.SH | 603080.SH | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.30 | 17.71 | 4.58 | 2.88 | 14.60 | 2.62 | 4.96 | 4.66 | 5.37 | 6.02 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 17.02 | 19.20 | 4.65 | 3.02 | 15.97 | 2.65 | 4.95 | 4.70 | 5.37 | 6.27 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 17.02 | 17.28 | 4.56 | 3.06 | 14.37 | 2.63 | 4.82 | 4.47 | 5.37 | 5.96 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 16.18 | 16.97 | 4.49 | 2.95 | 13.10 | 2.73 | 4.89 | 4.33 | 5.37 | 5.77 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 16.95 | 17.19 | 4.55 | 2.99 | 13.18 | 2.77 | 4.97 | 4.42 | 5.37 | 5.92 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2673 | 12.96 | 35.99 | 22.84 | 4.37 | 9.85 | 16.66 | 8.47 | 7.52 | 6.20 | 17.88 | ... | 12.99 | 23.42 | 47.99 | 32.40 | 22.45 | 28.79 | 23.18 | 24.45 | 14.98 | 26.06 |
2674 | 13.08 | 35.84 | 23.02 | 4.41 | 9.85 | 16.66 | 8.49 | 7.48 | 6.01 | 17.75 | ... | 12.83 | 25.76 | 45.14 | 35.64 | 24.70 | 31.67 | 25.50 | 26.90 | 16.48 | 28.67 |
2675 | 13.47 | 35.67 | 22.40 | 4.32 | 9.85 | 16.66 | 8.49 | 7.38 | 5.97 | 17.45 | ... | 12.20 | 28.34 | 43.21 | 39.20 | 27.17 | 34.84 | 28.05 | 29.59 | 18.13 | 31.54 |
2676 | 13.40 | 35.15 | 22.29 | 4.29 | 9.85 | 16.66 | 8.56 | 7.04 | 5.84 | 17.49 | ... | 12.11 | 31.17 | 43.76 | 40.88 | 29.89 | 34.84 | 29.64 | 32.55 | 19.94 | 34.69 |
2677 | 13.55 | 35.55 | 22.20 | 4.37 | 9.85 | 16.66 | 8.67 | 7.06 | 5.99 | 17.76 | ... | 11.91 | 34.29 | 41.71 | 39.10 | 32.88 | 34.84 | 27.92 | 31.82 | 21.93 | 38.16 |
2678 rows × 3562 columns
5.3 JSON Files
# orient: how the JSON is laid out ('records' = list of row objects)
# lines: whether each record sits on its own line
json_read = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", orient="records", lines=True)
json_read
article_link | headline | is_sarcastic | |
---|---|---|---|
0 | https://www.huffingtonpost.com/entry/versace-b... | former versace store clerk sues over secret 'b... | 0 |
1 | https://www.huffingtonpost.com/entry/roseanne-... | the 'roseanne' revival catches up to our thorn... | 0 |
2 | https://local.theonion.com/mom-starting-to-fea... | mom starting to fear son's web series closest ... | 1 |
3 | https://politics.theonion.com/boehner-just-wan... | boehner just wants wife to listen, not come up... | 1 |
4 | https://www.huffingtonpost.com/entry/jk-rowlin... | j.k. rowling wishes snape happy birthday in th... | 0 |
... | ... | ... | ... |
26704 | https://www.huffingtonpost.com/entry/american-... | american politics in moral free-fall | 0 |
26705 | https://www.huffingtonpost.com/entry/americas-... | america's best 20 hikes | 0 |
26706 | https://www.huffingtonpost.com/entry/reparatio... | reparations and obama | 0 |
26707 | https://www.huffingtonpost.com/entry/israeli-b... | israeli ban targeting boycott supporters raise... | 0 |
26708 | https://www.huffingtonpost.com/entry/gourmet-g... | gourmet gifts for the foodie 2014 | 0 |
26709 rows × 3 columns
# lines=True writes one record per line; otherwise everything goes onto a single line
json_read.to_json('./data/test.json', orient='records', lines=True)
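As a sanity check you can round-trip the file just written; it should load back into the same shape as json_read.
# Read back the line-delimited JSON written above
pd.read_json('./data/test.json', orient='records', lines=True).head()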
6. Advanced Processing
6.1 Handling Missing Values
# Missing values are usually represented as NaN (not a number), which is a float
type(np.nan)
float
# Load the data
movie = pd.read_csv('./data/IMDB-Movie-Data.csv')
movie.head()
Rank | Title | Genre | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Guardians of the Galaxy | Action,Adventure,Sci-Fi | A group of intergalactic criminals are forced ... | James Gunn | Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... | 2014 | 121 | 8.1 | 757074 | 333.13 | 76.0 |
1 | 2 | Prometheus | Adventure,Mystery,Sci-Fi | Following clues to the origin of mankind, a te... | Ridley Scott | Noomi Rapace, Logan Marshall-Green, Michael Fa... | 2012 | 124 | 7.0 | 485820 | 126.46 | 65.0 |
2 | 3 | Split | Horror,Thriller | Three girls are kidnapped by a man with a diag... | M. Night Shyamalan | James McAvoy, Anya Taylor-Joy, Haley Lu Richar... | 2016 | 117 | 7.3 | 157606 | 138.12 | 62.0 |
3 | 4 | Sing | Animation,Comedy,Family | In a city of humanoid animals, a hustling thea... | Christophe Lourdelet | Matthew McConaughey,Reese Witherspoon, Seth Ma... | 2016 | 108 | 7.2 | 60545 | 270.32 | 59.0 |
4 | 5 | Suicide Squad | Action,Adventure,Fantasy | A secret government agency recruits some of th... | David Ayer | Will Smith, Jared Leto, Margot Robbie, Viola D... | 2016 | 123 | 6.2 | 393727 | 325.02 | 40.0 |
# Check whether missing values exist
# isnull(): NaN -> True
# notnull(): NaN -> False
pd.isnull(movie)
Rank | Title | Genre | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | False | False |
1 | False | False | False | False | False | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | False | False | False | False | False | False | False | False | False | False | True | False |
996 | False | False | False | False | False | False | False | False | False | False | False | False |
997 | False | False | False | False | False | False | False | False | False | False | False | False |
998 | False | False | False | False | False | False | False | False | False | False | True | False |
999 | False | False | False | False | False | False | False | False | False | False | False | False |
1000 rows × 12 columns
np.any(pd.isnull(movie)) # True if any element is True, i.e. at least one NaN exists
True
np.all(pd.notnull(movie)) # True only if every element is non-NaN
False
6.1.1 Dropping Missing Values
# Drop every row that contains a NaN
data = movie.dropna()
np.any(pd.isnull(data))
False
6.1.2 Filling Missing Values (commonly with the mean or 0)
# Fill with the column mean
# inplace=True would modify movie directly
data = movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean())
# inplace defaults to False, so a new filled Series is returned and movie itself still contains NaN
np.any(pd.isnull(movie['Revenue (Millions)']))
True
movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean(), inplace=True)
# The NaN values in movie['Revenue (Millions)'] have now been filled
np.any(pd.isnull(movie['Revenue (Millions)']))
False
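To fill every column that still contains NaN, a simple loop is enough; this added sketch assumes (as in this dataset) that the remaining NaN columns are numeric, so taking a mean is valid.
# Replace NaN in each affected column with that column's mean
for col in movie.columns:
    if movie[col].isnull().any():
        movie[col] = movie[col].fillna(movie[col].mean())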
6.1.3 When Missing Values Are Not NaN
# Disable SSL certificate verification globally
# Read the data
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
wis = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
# First replace the '?' placeholders with NaN
# then handle the NaN values as before
wis
1000025 | 5 | 1 | 1.1 | 1.2 | 2 | 1.3 | 3 | 1.4 | 1.5 | 2.1 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1002945 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 2 |
1 | 1015425 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 2 |
2 | 1016277 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 2 |
3 | 1017023 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 2 |
4 | 1017122 | 8 | 10 | 10 | 8 | 7 | 10 | 9 | 7 | 1 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
693 | 776715 | 3 | 1 | 1 | 1 | 3 | 2 | 1 | 1 | 1 | 2 |
694 | 841769 | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 |
695 | 888820 | 5 | 10 | 10 | 3 | 7 | 3 | 8 | 10 | 2 | 4 |
696 | 897471 | 4 | 8 | 6 | 4 | 3 | 4 | 10 | 6 | 1 | 4 |
697 | 897471 | 4 | 8 | 8 | 5 | 4 | 5 | 10 | 4 | 1 | 4 |
698 rows × 11 columns
# to_replace: the value to be replaced; value: the replacement
wis = wis.replace(to_replace='?', value=np.nan)
wis = wis.dropna()
wis
1000025 | 5 | 1 | 1.1 | 1.2 | 2 | 1.3 | 3 | 1.4 | 1.5 | 2.1 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1002945 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 2 |
1 | 1015425 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 2 |
2 | 1016277 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 2 |
3 | 1017023 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 2 |
4 | 1017122 | 8 | 10 | 10 | 8 | 7 | 10 | 9 | 7 | 1 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
693 | 776715 | 3 | 1 | 1 | 1 | 3 | 2 | 1 | 1 | 1 | 2 |
694 | 841769 | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 |
695 | 888820 | 5 | 10 | 10 | 3 | 7 | 3 | 8 | 10 | 2 | 4 |
696 | 897471 | 4 | 8 | 6 | 4 | 3 | 4 | 10 | 6 | 1 | 4 |
697 | 897471 | 4 | 8 | 8 | 5 | 4 | 5 | 10 | 4 | 1 | 4 |
682 rows × 11 columns
6.2 Discretization
# Discretization simplifies the data by binning continuous values into a number of discrete intervals; it is often paired with one-hot encoding
# Load the data
data = pd.read_csv("./data/stock_day.csv")
data_p= data['p_change']
data_p
2018-02-27 2.68
2018-02-26 3.02
2018-02-23 2.42
2018-02-22 1.64
2018-02-14 2.05
...
2015-03-06 8.51
2015-03-05 2.02
2015-03-04 1.57
2015-03-03 1.44
2015-03-02 2.62
Name: p_change, Length: 643, dtype: float64
# pd.qcut(): quantile-based binning, so the groups are roughly equal in size
# q: number of bins
q_cut = pd.qcut(data_p, q=10)
q_cut
2018-02-27 (1.738, 2.938]
2018-02-26 (2.938, 5.27]
2018-02-23 (1.738, 2.938]
2018-02-22 (0.94, 1.738]
2018-02-14 (1.738, 2.938]
...
2015-03-06 (5.27, 10.03]
2015-03-05 (1.738, 2.938]
2015-03-04 (0.94, 1.738]
2015-03-03 (0.94, 1.738]
2015-03-02 (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
# value_counts(): number of samples falling in each bin
q_cut.value_counts()
(5.27, 10.03] 65
(0.26, 0.94] 65
(-0.462, 0.26] 65
(-10.030999999999999, -4.836] 65
(2.938, 5.27] 64
(1.738, 2.938] 64
(-1.352, -0.462] 64
(-2.444, -1.352] 64
(-4.836, -2.444] 64
(0.94, 1.738] 63
Name: p_change, dtype: int64
# pd.cut(data, bins): bin with user-specified interval edges
bins = [-100, -7, -5, -3, 0, 3, 5, 7, 100]
cut = pd.cut(data_p, bins=bins)
cut
2018-02-27 (0, 3]
2018-02-26 (3, 5]
2018-02-23 (0, 3]
2018-02-22 (0, 3]
2018-02-14 (0, 3]
...
2015-03-06 (7, 100]
2015-03-05 (0, 3]
2015-03-04 (0, 3]
2015-03-03 (0, 3]
2015-03-02 (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
cut.value_counts()
(0, 3] 215
(-3, 0] 188
(3, 5] 57
(-5, -3] 51
(7, 100] 35
(5, 7] 35
(-100, -7] 34
(-7, -5] 28
Name: p_change, dtype: int64
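cut can also attach readable labels to the intervals (one label per bin, so len(bins) - 1 of them); a small added sketch reusing the bins list above, with illustrative label names:
labels = ['big drop', 'drop', 'small drop', 'slight drop', 'slight rise', 'small rise', 'rise', 'big rise']
pd.cut(data_p, bins=bins, labels=labels).head()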
# get_dummies(): one-hot encode the bins
pd.get_dummies(q_cut)
(-10.030999999999999, -4.836] | (-4.836, -2.444] | (-2.444, -1.352] | (-1.352, -0.462] | (-0.462, 0.26] | (0.26, 0.94] | (0.94, 1.738] | (1.738, 2.938] | (2.938, 5.27] | (5.27, 10.03] | |
---|---|---|---|---|---|---|---|---|---|---|
2018-02-27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2018-02-26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2018-02-23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2018-02-22 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2018-02-14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2015-03-05 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2015-03-04 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2015-03-03 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2015-03-02 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
643 rows × 10 columns
data_dummy = pd.get_dummies(q_cut)
6.3 Combining Data
6.3.1 pd.concat()
# Without axis=1 the frames are stacked vertically (axis=0 is the default), misaligning them and producing many NaN
pd.concat([data, data_dummy], axis=1)
open | high | close | low | volume | price_change | p_change | ma5 | ma10 | ma20 | ... | (-10.030999999999999, -4.836] | (-4.836, -2.444] | (-2.444, -1.352] | (-1.352, -0.462] | (-0.462, 0.26] | (0.26, 0.94] | (0.94, 1.738] | (1.738, 2.938] | (2.938, 5.27] | (5.27, 10.03] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 22.942 | 22.142 | 22.875 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 22.406 | 21.955 | 22.942 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 21.938 | 21.929 | 23.022 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 21.446 | 21.909 | 23.137 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 21.366 | 21.923 | 23.253 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 14.28 | 13.13 | 179831.72 | 1.12 | 8.51 | 13.112 | 13.112 | 13.112 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 12.87 | 93180.39 | 0.26 | 2.02 | 12.820 | 12.820 | 12.820 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 12.61 | 67075.44 | 0.20 | 1.57 | 12.707 | 12.707 | 12.707 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 12.52 | 139071.61 | 0.18 | 1.44 | 12.610 | 12.610 | 12.610 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 12.20 | 96291.73 | 0.32 | 2.62 | 12.520 | 12.520 | 12.520 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
643 rows × 24 columns
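下面用两个手工构造的小DataFrame演示axis的影响(数据仅作示意):axis=1按列拼接并按行索引对齐,默认的axis=0则按行上下堆叠,列名不同的位置会被填上NaN。
df1 = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
df2 = pd.DataFrame({'b': [3, 4]}, index=['x', 'y'])
pd.concat([df1, df2], axis=1)   # 按行索引对齐,得到a、b两列
pd.concat([df1, df2])           # axis=0: 上下堆叠,a/b两列交错出现NaN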
6.3.2 pd.merge()
# 获取数据
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
# 默认内连接
# on: 以什么作为键来拼接
result = pd.merge(left, right, on=['key1', 'key2'])
# 内连接就是取交集
result
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
# 外连接
pd.merge(left, right, on=['key1', 'key2'], how='outer')
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K0 | K1 | A1 | B1 | NaN | NaN |
2 | K1 | K0 | A2 | B2 | C1 | D1 |
3 | K1 | K0 | A2 | B2 | C2 | D2 |
4 | K2 | K1 | A3 | B3 | NaN | NaN |
5 | K2 | K0 | NaN | NaN | C3 | D3 |
# 左连接
pd.merge(left, right, on=['key1', 'key2'], how='left')
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K0 | K1 | A1 | B1 | NaN | NaN |
2 | K1 | K0 | A2 | B2 | C1 | D1 |
3 | K1 | K0 | A2 | B2 | C2 | D2 |
4 | K2 | K1 | A3 | B3 | NaN | NaN |
# 右连接
pd.merge(left, right, on=['key1', 'key2'], how='right')
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
3 | K2 | K0 | NaN | NaN | C3 | D3 |
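当两张表的键列名不同时,可以用left_on/right_on分别指定(下面的表和列名是为演示构造的):
users = pd.DataFrame({'uid': ['u1', 'u2'], 'name': ['甲', '乙']})
orders = pd.DataFrame({'user_id': ['u1', 'u1', 'u3'], 'amount': [10, 20, 5]})
# uid与user_id是同一个键,左连接保留users的全部行
pd.merge(users, orders, left_on='uid', right_on='user_id', how='left')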
6.4 交叉表和透视表
data.head()
open | high | close | low | volume | price_change | p_change | ma5 | ma10 | ma20 | v_ma5 | v_ma10 | v_ma20 | turnover | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 22.942 | 22.142 | 22.875 | 53782.64 | 46738.65 | 55576.11 | 2.39 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 22.406 | 21.955 | 22.942 | 40827.52 | 42736.34 | 56007.50 | 1.53 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 21.938 | 21.929 | 23.022 | 35119.58 | 41871.97 | 56372.85 | 1.32 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 21.446 | 21.909 | 23.137 | 35397.58 | 39904.78 | 60149.60 | 0.90 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 21.366 | 21.923 | 23.253 | 33590.21 | 42935.74 | 61716.11 | 0.58 |
# 将data.index 转换为datetime格式,并提取星期几(0表示周一)
date = pd.to_datetime(data.index).weekday
data['week'] = date
data
open | high | close | low | volume | price_change | p_change | ma5 | ma10 | ma20 | v_ma5 | v_ma10 | v_ma20 | turnover | posi_neg | week | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 22.942 | 22.142 | 22.875 | 53782.64 | 46738.65 | 55576.11 | 2.39 | 1 | 1 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 22.406 | 21.955 | 22.942 | 40827.52 | 42736.34 | 56007.50 | 1.53 | 1 | 0 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 21.938 | 21.929 | 23.022 | 35119.58 | 41871.97 | 56372.85 | 1.32 | 1 | 4 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 21.446 | 21.909 | 23.137 | 35397.58 | 39904.78 | 60149.60 | 0.90 | 1 | 3 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 21.366 | 21.923 | 23.253 | 33590.21 | 42935.74 | 61716.11 | 0.58 | 1 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-03-06 | 13.17 | 14.48 | 14.28 | 13.13 | 179831.72 | 1.12 | 8.51 | 13.112 | 13.112 | 13.112 | 115090.18 | 115090.18 | 115090.18 | 6.16 | 1 | 4 |
2015-03-05 | 12.88 | 13.45 | 13.16 | 12.87 | 93180.39 | 0.26 | 2.02 | 12.820 | 12.820 | 12.820 | 98904.79 | 98904.79 | 98904.79 | 3.19 | 1 | 3 |
2015-03-04 | 12.80 | 12.92 | 12.90 | 12.61 | 67075.44 | 0.20 | 1.57 | 12.707 | 12.707 | 12.707 | 100812.93 | 100812.93 | 100812.93 | 2.30 | 1 | 2 |
2015-03-03 | 12.52 | 13.06 | 12.70 | 12.52 | 139071.61 | 0.18 | 1.44 | 12.610 | 12.610 | 12.610 | 117681.67 | 117681.67 | 117681.67 | 4.76 | 1 | 1 |
2015-03-02 | 12.25 | 12.67 | 12.52 | 12.20 | 96291.73 | 0.32 | 2.62 | 12.520 | 12.520 | 12.520 | 96291.73 | 96291.73 | 96291.73 | 3.30 | 1 | 0 |
643 rows × 16 columns
# 将p_change按正负划分为两类:上涨为1,下跌或持平为0
data['posi_neg'] = np.where(data['p_change']>0, 1, 0)
data.head()
open | high | close | low | volume | price_change | p_change | ma5 | ma10 | ma20 | v_ma5 | v_ma10 | v_ma20 | turnover | posi_neg | week | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2018-02-27 | 23.53 | 25.88 | 24.16 | 23.53 | 95578.03 | 0.63 | 2.68 | 22.942 | 22.142 | 22.875 | 53782.64 | 46738.65 | 55576.11 | 2.39 | 1 | 1 |
2018-02-26 | 22.80 | 23.78 | 23.53 | 22.80 | 60985.11 | 0.69 | 3.02 | 22.406 | 21.955 | 22.942 | 40827.52 | 42736.34 | 56007.50 | 1.53 | 1 | 0 |
2018-02-23 | 22.88 | 23.37 | 22.82 | 22.71 | 52914.01 | 0.54 | 2.42 | 21.938 | 21.929 | 23.022 | 35119.58 | 41871.97 | 56372.85 | 1.32 | 1 | 4 |
2018-02-22 | 22.25 | 22.76 | 22.28 | 22.02 | 36105.01 | 0.36 | 1.64 | 21.446 | 21.909 | 23.137 | 35397.58 | 39904.78 | 60149.60 | 0.90 | 1 | 3 |
2018-02-14 | 21.49 | 21.99 | 21.92 | 21.48 | 23331.04 | 0.44 | 2.05 | 21.366 | 21.923 | 23.253 | 33590.21 | 42935.74 | 61716.11 | 0.58 | 1 | 2 |
# 交叉表:统计每个星期几涨跌的次数,再手动换算成比例
count = pd.crosstab(data['week'], data['posi_neg'])
total = count.sum(axis=1).astype(np.float32)  # 避免用sum作变量名覆盖内置函数
pro = count.div(total, axis=0)
pro.plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1d55bc28408>
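其实pd.crosstab自带normalize参数,可以直接得到按行归一化的比例,省去手动div的步骤(结果与上面的pro等价):
# normalize='index' 表示在每一行(每个星期几)内部归一化为比例
pro2 = pd.crosstab(data['week'], data['posi_neg'], normalize='index')
pro2.plot(kind='bar', stacked=True)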
# 透视表:直接得到每个星期几posi_neg的均值,即上涨的比例
data.pivot_table(['posi_neg'], index='week')
posi_neg | |
---|---|
week | |
0 | 0.496000 |
1 | 0.580153 |
2 | 0.537879 |
3 | 0.507812 |
4 | 0.535433 |
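pivot_table默认对数值列求均值(aggfunc='mean'),也可以显式指定一个或多个聚合函数(仅作演示):
# 每个星期几的平均涨跌幅与样本数量
data.pivot_table(values='p_change', index='week', aggfunc=['mean', 'count'])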
6.5 分组和聚合
6.5.1 pd.groupby()
# 创建数据
col = pd.DataFrame({'color': ['white', 'red', 'green', 'red', 'green'],
                    'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
                    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                    'price2': [4.75, 4.12, 1.60, 0.75, 3.15]})
col
color | object | price1 | price2 | |
---|---|---|---|---|
0 | white | pen | 5.56 | 4.75 |
1 | red | pencil | 4.20 | 4.12 |
2 | green | pencil | 1.30 | 1.60 |
3 | red | ashtray | 0.56 | 0.75 |
4 | green | pen | 2.75 | 3.15 |
# 在DataFrame上直接分组,推荐这种写法
# 分组本身只是拆分数据,配合聚合计算(如mean、sum)才有分析价值
col.groupby(['color'])['price1'].mean()
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
# 设置as_index=False后,分组键color作为普通列保留,结果使用默认整数索引
col.groupby(['color'], as_index=False)['price1'].mean()
color | price1 | |
---|---|---|
0 | green | 2.025 |
1 | red | 2.380 |
2 | white | 5.560 |
col.price1
0 5.56
1 4.20
2 1.30
3 0.56
4 2.75
Name: price1, dtype: float64
# Series 分组
col.price1.groupby(col['color']).mean()
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
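分组之后也可以一次计算多个统计量,用agg传入函数名列表即可(基于上面的col表,仅作演示):
# 对每种颜色的price1同时求均值、最大值和数量
col.groupby('color')['price1'].agg(['mean', 'max', 'count'])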
6.5.2 分组实例
# 获取数据
starbucks = pd.read_csv("./data/starbucks/directory.csv")
starbucks.head()
Brand | Store Number | Store Name | Ownership Type | Street Address | City | State/Province | Country | Postcode | Phone Number | Timezone | Longitude | Latitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Starbucks | 47370-257954 | Meritxell, 96 | Licensed | Av. Meritxell, 96 | Andorra la Vella | 7 | AD | AD500 | 376818720 | GMT+1:00 Europe/Andorra | 1.53 | 42.51 |
1 | Starbucks | 22331-212325 | Ajman Drive Thru | Licensed | 1 Street 69, Al Jarf | Ajman | AJ | AE | NaN | NaN | GMT+04:00 Asia/Dubai | 55.47 | 25.42 |
2 | Starbucks | 47089-256771 | Dana Mall | Licensed | Sheikh Khalifa Bin Zayed St. | Ajman | AJ | AE | NaN | NaN | GMT+04:00 Asia/Dubai | 55.47 | 25.39 |
3 | Starbucks | 22126-218024 | Twofour 54 | Licensed | Al Salam Street | Abu Dhabi | AZ | AE | NaN | NaN | GMT+04:00 Asia/Dubai | 54.38 | 24.48 |
4 | Starbucks | 17127-178586 | Al Ain Tower | Licensed | Khaldiya Area, Abu Dhabi Island | Abu Dhabi | AZ | AE | NaN | NaN | GMT+04:00 Asia/Dubai | 54.54 | 24.51 |
# 以单个列(Country)作为分组依据
starbucks.groupby(['Country']).count()
Brand | Store Number | Store Name | Ownership Type | Street Address | City | State/Province | Postcode | Phone Number | Timezone | Longitude | Latitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Country | ||||||||||||
AD | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
AE | 144 | 144 | 144 | 144 | 144 | 144 | 144 | 24 | 78 | 144 | 144 | 144 |
AR | 108 | 108 | 108 | 108 | 108 | 108 | 108 | 100 | 29 | 108 | 108 | 108 |
AT | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 17 | 18 | 18 | 18 |
AU | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 0 | 22 | 22 | 22 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
TT | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 3 |
TW | 394 | 394 | 394 | 394 | 394 | 394 | 394 | 365 | 39 | 394 | 394 | 394 |
US | 13608 | 13608 | 13608 | 13608 | 13608 | 13608 | 13608 | 13607 | 13122 | 13608 | 13608 | 13608 |
VN | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 23 | 25 | 25 | 25 |
ZA | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 2 | 3 | 3 | 3 |
73 rows × 12 columns
starbucks_count = starbucks.groupby(['Country']).count()
starbucks_count['Brand'].plot(kind='bar', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1d563dbd148>
# 为了便于阅读,先按门店数量降序排序,取前20个国家再画图
starbucks_count.sort_values(by='Brand', ascending=False).head(20)['Brand'].plot(kind='bar', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1d55ca68f48>
# 以多个列(Country、State/Province)作为分组依据
starbucks.groupby(['Country', 'State/Province']).count().head(20)
Brand | Store Number | Store Name | Ownership Type | Street Address | City | Postcode | Phone Number | Timezone | Longitude | Latitude | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|
Country | State/Province | |||||||||||
AD | 7 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
AE | AJ | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 0 | 2 | 2 | 2 |
AZ | 48 | 48 | 48 | 48 | 48 | 48 | 7 | 20 | 48 | 48 | 48 | |
DU | 82 | 82 | 82 | 82 | 82 | 82 | 16 | 50 | 82 | 82 | 82 | |
FU | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 0 | 2 | 2 | 2 | |
RK | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 3 | 3 | |
SH | 6 | 6 | 6 | 6 | 6 | 6 | 0 | 5 | 6 | 6 | 6 | |
UQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | |
AR | B | 21 | 21 | 21 | 21 | 21 | 21 | 18 | 5 | 21 | 21 | 21 |
C | 73 | 73 | 73 | 73 | 73 | 73 | 71 | 24 | 73 | 73 | 73 | |
M | 5 | 5 | 5 | 5 | 5 | 5 | 2 | 0 | 5 | 5 | 5 | |
S | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 3 | |
X | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 0 | 6 | 6 | 6 | |
AT | 3 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | |
9 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 13 | 14 | 14 | 14 | |
AU | NSW | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 0 | 9 | 9 | 9 |
QLD | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 0 | 8 | 8 | 8 | |
VIC | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 0 | 5 | 5 | 5 | |
AW | AW | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 3 | 3 |
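多列分组得到的是多级索引,如果后续还要当普通表继续处理,可以只统计一列并用reset_index转回普通DataFrame(列名store_count为示例自取):
store_counts = (starbucks.groupby(['Country', 'State/Province'])['Brand']
                .count()
                .reset_index(name='store_count'))
store_counts.head()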