pandas 使用总结

import pandas as pd

import numpy as np

## 从字典初始化df

ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',

         'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],

         'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],

         'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],

         'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}

df = pd.DataFrame(ipl_data)

print(df)

      Team  Rank  Year  Points

0   Riders     1  2014     876

1   Riders     2  2015     789

2   Devils     2  2014     863

3   Devils     3  2015     673

4    Kings     3  2014     741

5    Kings     4  2015     812

6    Kings     1  2016     756

7    Kings     1  2017     788

8   Riders     2  2016     694

9   Royals     4  2014     701

10  Royals     1  2015     804

11  Riders     2  2017     690

print(df.groupby('Team')) ##　groupby 返回的对象

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7fcbff80a240>

print(df.groupby('Team').groups) ##用groups属性来进行查看每个分组

{'Devils': Int64Index([2, 3], dtype='int64'), 'Kings': Int64Index([4, 5, 6, 7], dtype='int64'), 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'), 'Royals': Int64Index([9, 10], dtype='int64')}

## 对groupby 后的结果进行遍历

grouped = df.groupby('Year')

for name,group in grouped:

    print(name)

    print(group)

2014

     Team  Rank  Year  Points

0  Riders     1  2014     876

2  Devils     2  2014     863

4   Kings     3  2014     741

9  Royals     4  2014     701

2015

      Team  Rank  Year  Points

1   Riders     2  2015     789

3   Devils     3  2015     673

5    Kings     4  2015     812

10  Royals     1  2015     804

2016

     Team  Rank  Year  Points

6   Kings     1  2016     756

8  Riders     2  2016     694

2017

      Team  Rank  Year  Points

7    Kings     1  2017     788

11  Riders     2  2017     690

## 从多个groups中获取单个group

grouped = df.groupby('Year')

print(grouped.get_group(2014))

     Team  Rank  Year  Points

0  Riders     1  2014     876

2  Devils     2  2014     863

4   Kings     3  2014     741

9  Royals     4  2014     701

## 使用agg聚合函数计算均值

grouped = df.groupby('Year')

print(grouped['Points'].agg('mean'))

Year

2014    795.25

2015    769.50

2016    725.00

2017    739.00

Name: Points, dtype: float64

## 使用agg聚合函数计算数据条数

grouped = df.groupby('Team')

print(grouped.agg(np.size))

        Rank  Year  Points

Team

Devils     2     2       2

Kings      4     4       4

Riders     4     4       4

Royals     2     2       2

## 使用多个agg聚合函数进行计算

grouped = df.groupby('Team')

print(grouped.agg([np.sum, np.mean, np.std]))

print(grouped['Points'].agg([np.sum, np.mean, np.std]))

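print(grouped['Points'].agg({'Points':[np.sum, np.mean, np.std],'Rank':[np.mean]}))  ## 注意：这里是对 Points 这一列（Series）调用 agg，字典的键只是输出列的名字，所以下面输出中 Rank/mean 一列实际上仍是 Points 的均值，并且会触发 FutureWarning。若要对不同的列分别指定聚合函数，应对整个分组对象调用：grouped.agg({'Points':[np.sum, np.mean, np.std],'Rank':[np.mean]})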

       Rank                  Year                   Points

        sum  mean       std   sum    mean       std    sum    mean         std

Team

Devils    5  2.50  0.707107  4029  2014.5  0.707107   1536  768.00  134.350288

Kings     9  2.25  1.500000  8062  2015.5  1.290994   3097  774.25   31.899582

Riders    7  1.75  0.500000  8062  2015.5  1.290994   3049  762.25   88.567771

Royals    5  2.50  2.121320  4029  2014.5  0.707107   1505  752.50   72.831998

         sum    mean         std

Team

Devils  1536  768.00  134.350288

Kings   3097  774.25   31.899582

Riders  3049  762.25   88.567771

Royals  1505  752.50   72.831998

       Points                        Rank

          sum    mean         std    mean

Team

Devils   1536  768.00  134.350288  768.00

Kings    3097  774.25   31.899582  774.25

Riders   3049  762.25   88.567771  762.25

Royals   1505  752.50   72.831998  752.50

/home/disk1/data/tangshengyu_dxm/tools/env_py36/lib/python3.6/site-packages/ipykernel_launcher.py:5: FutureWarning: using a dict on a Series for aggregation

is deprecated and will be removed in a future version

  """

## grouped数据重新生成dataframe

print(df.groupby('Year')['Team'].apply(len).reset_index())   ## reset_index：Year 还原为普通列，得到单层列名的 DataFrame

print(df.groupby('Year')['Team'].apply(len).to_frame())      ## to_frame：Year 保留为索引，Team 是唯一的数据列

   Year  Team

0  2014     4

1  2015     4

2  2016     2

3  2017     2

      Team

Year

2014     4

2015     4

2016     2

2017     2

## 更改聚合后的列名

grouped_df = grouped.agg({'Points':['min','max','mean']})

print(grouped_df.columns)

print(grouped_df.columns.values)

grouped_df.columns = ['_'.join(col_tuple) for col_tuple in grouped_df.columns.values]

grouped_df.reset_index()

MultiIndex(levels=[['Points'], ['min', 'max', 'mean']],

           labels=[[0, 0, 0], [0, 1, 2]])

[('Points', 'min') ('Points', 'max') ('Points', 'mean')]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

	Team	Points_min	Points_max	Points_mean
0	Devils	673	863	768.00
1	Kings	741	812	774.25
2	Riders	690	876	762.25
3	Royals	701	804	752.50

## group 后的数据进行transform

grouped = df.groupby('Team')

score = lambda x: (x - x.mean())

print(grouped.transform(score))

    Rank  Year  Points

0  -0.75  -1.5  113.75

1   0.25  -0.5   26.75

2  -0.50  -0.5   95.00

3   0.50   0.5  -95.00

4   0.75  -1.5  -33.25

5   1.75  -0.5   37.75

6  -1.25   0.5  -18.25

7  -1.25   1.5   13.75

8   0.25   0.5  -68.25

9   1.50  -0.5  -51.50

10 -1.50   0.5   51.50

11  0.25   1.5  -72.25

## filter 过滤（保留满足条件的分组，返回这些分组中的全部行）

grouped = df.groupby('Team')

print(grouped.filter(lambda x: len(x)>3))

      Team  Rank  Year  Points

0   Riders     1  2014     876

1   Riders     2  2015     789

4    Kings     3  2014     741

5    Kings     4  2015     812

6    Kings     1  2016     756

7    Kings     1  2017     788

8   Riders     2  2016     694

11  Riders     2  2017     690

## 每个分组的数据量

grouped = df.groupby('Team')

print(grouped.apply(lambda x: len(x)))

print(type(grouped.apply(lambda x: len(x))))

Team

Devils    2

Kings     4

Riders    4

Royals    2

dtype: int64

<class 'pandas.core.series.Series'>

## 多行字符串组合成一行

print(df)

df_grouped = df.groupby(['Year'])['Team'].apply(';'.join).reset_index()

print(df_grouped)

      Team  Rank  Year  Points

0   Riders     1  2014     876

1   Riders     2  2015     789

2   Devils     2  2014     863

3   Devils     3  2015     673

4    Kings     3  2014     741

5    Kings     4  2015     812

6    Kings     1  2016     756

7    Kings     1  2017     788

8   Riders     2  2016     694

9   Royals     4  2014     701

10  Royals     1  2015     804

11  Riders     2  2017     690

   Year                        Team

0  2014  Riders;Devils;Kings;Royals

1  2015  Riders;Devils;Kings;Royals

2  2016                Kings;Riders

3  2017                Kings;Riders

## 一行变多行

def explode(df,tar_col_name):
    """Expand a list-valued column so that every list element gets its own row.

    All columns other than ``tar_col_name`` are moved into the index, the
    lists are spread into a wide frame (one column per list position), and
    that frame is stacked back into long form.

    Args:
        df: DataFrame whose ``tar_col_name`` column holds lists.
        tar_col_name: Name of the list-valued column to expand.

    Returns:
        DataFrame with the remaining columns first (in the order produced by
        ``Index.difference``) followed by ``tar_col_name``. The index that is
        left over is the stacked level, i.e. each element's position within
        its original list.
    """
    # Every column except the target identifies a row; Index.difference
    # returns an Index, so convert it to a plain list of labels.
    key_cols = list(df.columns.difference([tar_col_name]))
    keyed = df.set_index(key_cols)

    # One column per list position, rows still labelled by the key columns.
    wide = pd.DataFrame(keyed[tar_col_name].tolist(), index=keyed.index)

    # Stack the positional columns into rows and name the value column.
    long_form = wide.stack().to_frame()
    long_form.columns = [tar_col_name]

    # Bring the key columns back as ordinary columns.
    return long_form.reset_index(level=key_cols)

df_grouped['Team'] = df_grouped['Team'].apply(lambda s:s.split(';')) ## 先split得到list

print(df_grouped)

explode(df_grouped,'Team')

   Year                             Team

0  2014  [Riders, Devils, Kings, Royals]

1  2015  [Riders, Devils, Kings, Royals]

2  2016                  [Kings, Riders]

3  2017                  [Kings, Riders]

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

	Year	Team
0	2014	Riders
1	2014	Devils
2	2014	Kings
3	2014	Royals
0	2015	Riders
1	2015	Devils
2	2015	Kings
3	2015	Royals
0	2016	Kings
1	2016	Riders
0	2017	Kings
1	2017	Riders

# 将多列合并成一列

data = [['Alex', 10, 150], ['Bob',  12, 153], ['Clarke', 13, 160], ['Tom', 12, 160]]

df = pd.DataFrame(data, columns=['Name', 'Age', 'Stature'])

print(df)

df_new = df['Age'].astype(str) +'-'+ df['Stature'].astype(str)

print(df_new)

     Name  Age  Stature

0    Alex   10      150

1     Bob   12      153

2  Clarke   13      160

3     Tom   12      160

0    10-150

1    12-153

2    13-160

3    12-160

dtype: object

## 一列拆分成多列

ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',

         'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],

         'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],

         'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],

         'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}

df = pd.DataFrame(ipl_data)

df_grouped = df.groupby(['Year'])['Team'].apply(';'.join).reset_index()

print(df_grouped)

df_grouped['Team'].str.split(';', expand=True)

   Year                        Team

0  2014  Riders;Devils;Kings;Royals

1  2015  Riders;Devils;Kings;Royals

2  2016                Kings;Riders

3  2017                Kings;Riders

.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}

.dataframe tbody tr th {

    vertical-align: top;

}

.dataframe thead th {

    text-align: right;

}

	0	1	2	3
0	Riders	Devils	Kings	Royals
1	Riders	Devils	Kings	Royals
2	Kings	Riders	None	None
3	Kings	Riders	None	None

def df2libsvm(df,missing_value='-9999'):
    """Turn each row of ``df`` into a list of ``"column:value"`` strings.

    Args:
        df: Source DataFrame; rows are read by position.
        missing_value: Cells that compare equal (``==``) to this value are
            left out of the row's output. The default is the *string*
            ``'-9999'``, so a numeric ``-9999`` cell is not skipped by default.

    Returns:
        A list with one entry per row; each entry is the list of
        ``"column:value"`` strings for that row's non-missing cells.
    """
    libsvm_rows = []
    for pos in range(len(df)):
        # Positional row access, converted to a {column: value} mapping.
        record = df.iloc[pos].to_dict()
        libsvm_rows.append([
            '%s:%s'%(col,str(val))
            for col, val in record.items()
            if not (val==missing_value)
        ])
    return libsvm_rows

def libsvm2df():
    """Unimplemented placeholder; the body is only this docstring.

    Calling the function therefore returns ``None``.
    NOTE(review): presumably intended as the inverse of ``df2libsvm``
    (inferred from the name only) -- confirm before implementing.

    Sketch left by the author: build a DataFrame from a list of per-row dicts.

        mydict = [{'b': 2, 'c': 3, 'd': 4},
                  {'a': 100, 'c': 300, 'd': 400},
                  {'a': 1000, 'b': 2000, 'c': 3000}]
        df = pd.DataFrame(mydict)
    """

def calcu_iv(df,feat_col,label_col,good,bad):
    """Compute Weight of Evidence (WoE) per bin and the Information Value (IV).

    Rows are grouped by ``feat_col``. Within each group the rows whose
    ``label_col`` equals ``good`` / ``bad`` are counted, and from those counts:

        distribution_good = bin_good_cnt / total good count
        distribution_bad  = bin_bad_cnt  / total bad count
        WoE = ln(distribution_good / distribution_bad)
        IV  = WoE * (distribution_good - distribution_bad)

    Args:
        df: Input DataFrame containing ``feat_col`` and ``label_col``.
        feat_col: Column (or columns) to group by; each distinct value is a bin.
        label_col: Column holding the class label.
        good: Label value counted as "good".
        bad: Label value counted as "bad".

    Returns:
        Tuple ``(iv, df_woe)``: ``iv`` is the sum of the per-bin ``IV``
        column; ``df_woe`` has one row per bin with the columns ``feat_col``,
        ``bin_good_cnt``, ``bin_bad_cnt``, ``distribution_good``,
        ``distribution_bad``, ``WoE`` and ``IV``.
    """
    import numpy as np

    # Select the label column before applying, so only that Series is handed
    # to the counting functions. Applying a function to whole group frames
    # (grouping columns included) is deprecated in recent pandas releases.
    labels_by_bin = df.groupby(feat_col)[label_col]
    df_woe = pd.DataFrame({
        'bin_good_cnt': labels_by_bin.apply(lambda s: (s == good).sum()),
        'bin_bad_cnt': labels_by_bin.apply(lambda s: (s == bad).sum()),
    }).reset_index()

    # Totals are taken from the raw counts, before zero bins are smoothed.
    all_good_cnt = df_woe['bin_good_cnt'].sum()
    all_bad_cnt = df_woe['bin_bad_cnt'].sum()

    # Avoid dividing by zero when one of the classes never occurs.
    if all_bad_cnt == 0:
        all_bad_cnt = 1
    if all_good_cnt == 0:
        all_good_cnt = 1

    # Empty bins are replaced by 0.1 so the ratio and its log stay finite.
    df_woe = df_woe.replace({'bin_bad_cnt': {0: 0.1}})
    df_woe = df_woe.replace({'bin_good_cnt': {0: 0.1}})

    df_woe['distribution_good'] = df_woe['bin_good_cnt'] / float(all_good_cnt)
    df_woe['distribution_bad'] = df_woe['bin_bad_cnt'] / float(all_bad_cnt)
    df_woe['WoE'] = np.log(df_woe['distribution_good'] / df_woe['distribution_bad'])
    df_woe['IV'] = df_woe['WoE'] * (df_woe['distribution_good'] - df_woe['distribution_bad'])

    iv = df_woe['IV'].sum()
    return iv, df_woe

pandas 使用总结的更多相关文章

pandas基础-Python3
未完 for examples: example 1: # Code based on Python 3.x # _*_ coding: utf-8 _*_ # __Author: "LEM ...
10 Minutes to pandas
摘要一.创建对象二.查看数据三.选择和设置四.缺失值处理五.相关操作六.聚合七.重排(Reshaping) 八.时间序列九.Categorical类型十.画图十一 ...
利用Python进行数据分析(15) pandas基础: 字符串操作
字符串对象方法 split()方法拆分字符串: strip()方法去掉空白符和换行符: split()结合strip()使用: "+"符号可以将多个字符串连接起来: join( ...
利用Python进行数据分析(10) pandas基础: 处理缺失数据
数据不完整在数据分析的过程中很常见. pandas使用浮点值NaN表示浮点和非浮点数组里的缺失数据. pandas使用isnull()和notnull()函数来判断缺失情况. 对于缺失数据一般处理 ...
利用Python进行数据分析(12) pandas基础: 数据合并
pandas 提供了三种主要方法可以对数据进行合并: pandas.merge()方法:数据库风格的合并: pandas.concat()方法:轴向连接,即沿着一条轴将多个对象堆叠到一起: 实例方法c ...
利用Python进行数据分析(9) pandas基础: 汇总统计和计算
pandas 对象拥有一些常用的数学和统计方法. 例如,sum() 方法,进行列小计: sum() 方法传入 axis=1 指定为横向汇总,即行小计: idxmax() 获取最大值对应的索 ...
利用Python进行数据分析(8) pandas基础: Series和DataFrame的基本操作
一.reindex() 方法:重新索引针对 Series 重新索引指的是根据index参数重新进行排序. 如果传入的索引值在数据里不存在,则不会报错,而是添加缺失值的新行. 不想用缺失值,可以用 ...
利用Python进行数据分析(7) pandas基础: Series和DataFrame的简单介绍
一.pandas 是什么 pandas 是基于 NumPy 的一个 Python 数据分析包,主要目的是为了数据分析.它提供了大量高级的数据结构和对数据处理的方法. pandas 有两个主要的数据结构 ...
pandas.DataFrame对行和列求和及添加新行和列
导入模块: from pandas import DataFrame import pandas as pd import numpy as np 生成DataFrame数据 df = DataFra ...
pandas.DataFrame排除特定行
使用Python进行数据分析时,经常要使用到的一个数据结构就是pandas的DataFrame 如果我们想要像Excel的筛选那样,只要其中的一行或某几行,可以使用isin()方法,将需要的行的值以列 ...

随机推荐

在Git的PR(Pull Request)提示冲突无法merge合并的解决方案
问题假设有一个分支A,向master分支提交PR,然后发生无法自动解决的冲突,PR提示不能执行merge合并. 解决方案1 本地checkout检出并切换到A分支,pull拉取更新到最新代码在本地 ...
CF1200D White Lines | 前缀和
传送门 Examples input 1 4 2 BWWW WBBW WBBW WWWB output 1 4 input 2 3 1 BWB WWB BWB output 2 2 input 3 5 ...
P2871 [USACO07DEC]手链Charm Bracelet（01背包模板）
题目传送门:P2871 [USACO07DEC]手链Charm Bracelet 题目描述 Bessie has gone to the mall's jewelry store and spies ...
SpringBoot系列之集成Dubbo的方式
SpringBoot系列之集成Dubbo的方式本博客介绍Springboot框架集成Dubbo实现微服务的3种常用方式,对于Dubbo知识不是很熟悉的,请先学习我上一篇博客:SpringBoot系列 ...
Python使用requests爬取一个网页并保存
#导入 requests模块import requests #设置请求头,让网站监测是浏览器 headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6. ...
git 删除和复制远程分支储藏
不常用到命令经常查,特别记录,方便自己开箱即用. 1. 删除本地分支 git branch -D <dev> 2. 删除远程分支 git push origin --delete < ...
如何用好Go的测试黑科技
测试是每一个开发人员都需要掌握的技能,尽管你不需要像测试人员那么专业,但你也应该尽可能的做到那么专业,据我了解到我身边的一些Go开发人员,他们对Go的测试仅仅局限于写一个_test.go 测试文件,对 ...
Nginx作为负载均衡——实战演练
配置语法 Syntax:upstream name {...} Default:—— Context:http 演示准备两台虚拟主机192.168.96.188.192.168.96.188 在18 ...
今天我的jupyter notebook打不开了，报错原因'No module named 'zmq.eventloop'
今天我的jupyter notebook打不开了,就是那种一打开出现黑色界面就退出的那种,惊恐爬上了我的面颊. 找了一个小时,试了好几种办法(包括别人说的什么把属性里面后面的%%的去掉)终究无果打开 ...
python 进程管道
数据不安全,不常用 import time from multiprocessing import Pipe, Process def producer(prod, cons, name, food) ...

pandas 使用总结

pandas 使用总结的更多相关文章

随机推荐

热门专题