


  1. # Create the range of dates here
  2. seven_days = pd.date_range('2017-1-1', periods=7)
  3. # Iterate over the dates and print the number and name of the weekday
  4. for day in seven_days:
  5. print(day.dayofweek, day.weekday_name)
  6. <script.py> output:
  7. 6 Sunday
  8. 0 Monday
  9. 1 Tuesday
  10. 2 Wednesday
  11. 3 Thursday
  12. 4 Friday
  13. 5 Saturday



  1. data = pd.read_csv('nyc.csv')
  2. # Inspect data
  3. print(data.info())
  4. # Convert the date column to datetime64
  5. data.date = pd.to_datetime(data.date)
  6. # Set date column as index
  7. data.set_index('date', inplace=True)
  8. # Inspect data
  9. print(data.info())
  10. # Plot data
  11. data.plot(subplots=True)
  12. plt.show()
  13. <script.py> output:
  14. <class 'pandas.core.frame.DataFrame'>
  15. RangeIndex: 6317 entries, 0 to 6316
  16. Data columns (total 4 columns):
  17. date 6317 non-null object
  18. ozone 6317 non-null float64
  19. pm25 6317 non-null float64
  20. co 6317 non-null float64
  21. dtypes: float64(3), object(1)
  22. memory usage: 197.5+ KB
  23. None
  24. <class 'pandas.core.frame.DataFrame'>
  25. DatetimeIndex: 6317 entries, 1999-07-01 to 2017-03-31
  26. Data columns (total 3 columns):
  27. ozone 6317 non-null float64
  28. pm25 6317 non-null float64
  29. co 6317 non-null float64
  30. dtypes: float64(3)
  31. memory usage: 197.4 KB
  32. None

  1. # Create dataframe prices here
  2. prices = pd.DataFrame()
  3. # Select data for each year and concatenate with prices here
  4. for year in ['2013', '2014', '2015']:
  5. price_per_year = yahoo.loc[year, ['price']].reset_index(drop=True)
  6. price_per_year.rename(columns={'price': year}, inplace=True)
  7. prices = pd.concat([prices, price_per_year], axis=1)
  8. # Plot prices
  9. prices.plot()
  10. plt.show()



  1. # Inspect data
  2. print(co.info())
  3. # Set the frequency to calendar daily
  4. co = co.asfreq('D')
  5. # Plot the data
  6. co.plot(subplots=True)
  7. plt.show()
  8. # Set frequency to monthly
  9. co = co.asfreq('M')
  10. # Plot the data
  11. co.plot(subplots=True)
  12. plt.show()







  1. # Import data here
  2. google = pd.read_csv('google.csv', parse_dates=['Date'], index_col='Date')
  3. # Set data frequency to business daily
  4. google = google.asfreq('B')
  5. # Create 'lagged' and 'shifted'
  6. google['lagged'] = google.Close.shift(periods=-90)
  7. google['shifted'] = google.Close.shift(periods=90)
  8. # Plot the google price series
  9. google.plot()
  10. plt.show()


  • 减:.sub()
  • 加:.add()
  • 乘:.mul()
  • 除:.div()



  1. # Convert index series to dataframe here
  2. data = index.to_frame('Index')
  3. # Normalize djia series and add as new column to data
  4. djia = djia.div(djia.iloc[0]).mul(100)
  5. data['DJIA'] = djia
  6. # Show total return for both index and djia
  7. print(data.iloc[-1].div(data.iloc[0]).sub(1).mul(100))
  1. # Create daily_return
  2. google['daily_return'] = google.Close.pct_change().mul(100)
  3. # Create monthly_return
  4. google['monthly_return'] = google.Close.pct_change(30).mul(100)
  5. # Create annual_return
  6. google['annual_return'] = google.Close.pct_change(360).mul(100)
  7. # Plot the result
  8. google['daily_return']
  9. google.plot(subplots=True)
  10. plt.show()



  1. # Import data here
  2. prices = pd.read_csv('asset_classes.csv',parse_dates=['DATE'],index_col='DATE')
  3. # Inspect prices here
  4. print(prices.info())
  5. # Select first prices
  6. first_prices = prices.iloc[0]
  7. # Create normalized
  8. normalized = prices.div(first_prices).mul(100)
  9. # Plot normalized
  10. #画图这个地方老是写错,记住直接调用
  11. normalized.plot()
  12. plt.show()

  1. # Create tickers
  2. tickers = ['MSFT', 'AAPL']
  3. # Import stock data here
  4. stocks = pd.read_csv('msft_aapl.csv', parse_dates=['date'], index_col='date')
  5. # Import index here
  6. sp500 = pd.read_csv('sp500.csv', parse_dates=['date'], index_col='date')
  7. # Concatenate stocks and index here
  8. data = pd.concat([stocks, sp500], axis=1).dropna()
  9. # Normalize data
  10. normalized = data.div(data.iloc[0]).mul(100)
  11. # Subtract the normalized index from the normalized stock prices, and plot the result
  12. normalized[tickers].sub(normalized['SP500'], axis=0).plot()
  13. plt.show()



DataFrame.reindex(self, labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None)


  1. # Set start and end dates
  2. start = '2016-1-1'
  3. end = '2016-2-29'
  4. # Create monthly_dates here
  5. #这个就是创建一个指定的起止时间,然后有相同的时间间隔
  6. monthly_dates = pd.date_range(start=start, end=end, freq='M')
  7. # Create monthly here,构造一个时间序列,但是要给一个时间戳
  8. monthly = pd.Series(data=[1,2], index=monthly_dates)
  9. print(monthly)
  10. # Create weekly_dates here
  11. weekly_dates = pd.date_range(start=start, end=end, freq='W')
  12. # Print monthly, reindexed using weekly_dates
  13. print(monthly.reindex(weekly_dates))
  14. print(monthly.reindex(weekly_dates, method='bfill'))
  15. print(monthly.reindex(weekly_dates, method='ffill'))
  16. #ffill : forward fill 向前填充,
  17. #如果新增加索引的值不存在,那么按照前一个非nan的值填充进去
  18. #同理,bfill(backward fill)是向后填充,即按照后一个非nan的值填充进去
  19. <script.py> output:
  20. 2016-01-31 1
  21. 2016-02-29 2
  22. Freq: M, dtype: int64
  23. 2016-01-03 NaN
  24. 2016-01-10 NaN
  25. 2016-01-17 NaN
  26. 2016-01-24 NaN
  27. 2016-01-31 1.0
  28. 2016-02-07 NaN
  29. 2016-02-14 NaN
  30. 2016-02-21 NaN
  31. 2016-02-28 NaN
  32. Freq: W-SUN, dtype: float64
  33. 2016-01-03 1
  34. 2016-01-10 1
  35. 2016-01-17 1
  36. 2016-01-24 1
  37. 2016-01-31 1
  38. 2016-02-07 2
  39. 2016-02-14 2
  40. 2016-02-21 2
  41. 2016-02-28 2
  42. Freq: W-SUN, dtype: int64
  43. 2016-01-03 NaN
  44. 2016-01-10 NaN
  45. 2016-01-17 NaN
  46. 2016-01-24 NaN
  47. 2016-01-31 1.0
  48. 2016-02-07 1.0
  49. 2016-02-14 1.0
  50. 2016-02-21 1.0
  51. 2016-02-28 1.0
  52. Freq: W-SUN, dtype: float64



  1. # Import data here
  2. data = pd.read_csv('unemployment.csv', parse_dates=['date'], index_col='date')
  3. # Show first five rows of weekly series
  4. print(data.asfreq('W').head())
  5. # Show first five rows of weekly series with bfill option
  6. print(data.asfreq('W', method='bfill').head())
  7. # Create weekly series with ffill option and show first five rows
  8. weekly_ffill = data.asfreq('W', method='ffill')
  9. print(weekly_ffill.head())
  10. # Plot weekly_fill starting 2015 here
  11. weekly_ffill.loc['2015':].plot()
  12. plt.show()




  1. >>> s = pd.Series([np.nan, "single_one", np.nan,
  2. ... "fill_two_more", np.nan, np.nan, np.nan,
  3. ... 4.71, np.nan])
  4. >>> s
  5. 0 NaN
  6. 1 single_one
  7. 2 NaN
  8. 3 fill_two_more
  9. 4 NaN
  10. 5 NaN
  11. 6 NaN
  12. 7 4.71
  13. 8 NaN
  14. dtype: object
  15. >>> s.interpolate(method='pad', limit=2)
  16. 0 NaN
  17. 1 single_one
  18. 2 single_one
  19. 3 fill_two_more
  20. 4 fill_two_more
  21. 5 fill_two_more
  22. 6 NaN
  23. 7 4.71
  24. 8 4.71
  25. dtype: object



DataFrame.resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None)

  1. # Import and inspect data here
  2. ozone = pd.read_csv('ozone.csv',parse_dates=['date'],index_col='date')
  3. print(ozone.info())
  4. # Calculate and plot the weekly average ozone trend
  5. #日期的fre是week,并且求出每周的平均值
  6. ozone.resample('W').mean().plot()
  7. plt.show()






  1. >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
  2. >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
  3. >>> ts
  4. A
  5. 2018-04-09 1
  6. 2018-04-11 2
  7. 2018-04-13 3
  8. 2018-04-15 4
  9. Get the rows for the first 3 days:
  10. >>> ts.first('3D')
  11. A
  12. 2018-04-09 1
  13. 2018-04-11 2


DataFrame.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs)

表示当前元素与前一个元素之间的变化百分比;若指定 periods=n,则表示当前元素与其前面第 n 个元素之间的变化百分比



  1. >>> df = pd.DataFrame({
  2. ... 'FR': [4.0405, 4.0963, 4.3149],
  3. ... 'GR': [1.7246, 1.7482, 1.8519],
  4. ... 'IT': [804.74, 810.01, 860.13]},
  5. ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
  6. >>> df
  7. FR GR IT
  8. 1980-01-01 4.0405 1.7246 804.74
  9. 1980-02-01 4.0963 1.7482 810.01
  10. 1980-03-01 4.3149 1.8519 860.13
  11. >>> df.pct_change()
  12. FR GR IT
  13. 1980-01-01 NaN NaN NaN
  14. 1980-02-01 0.013810 0.013684 0.006549
  15. 1980-03-01 0.053365 0.059318 0.061876


pd.concat 这个函数类似于 R 里面的 rbind,默认(axis=0)按行拼接,即纵向合并;指定 axis=1 时则按列横向拼接

  1. >>> pd.concat([s1, s2], ignore_index=True)
  2. 0 a
  3. 1 b
  4. 2 c
  5. 3 d
  6. dtype: object




rolling window functions.



查到了 参考


DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None)

  • window







  1. # Import and inspect ozone data here
  2. data = pd.read_csv('ozone.csv', parse_dates=['date'], index_col='date').dropna()
  3. # Calculate the rolling mean and std here
  4. rolling_stats = data.Ozone.rolling(360).agg(['mean', 'std'])
  5. # Join rolling_stats with ozone data
  6. #默认左拼接,有点像R里面的left_join()
  7. stats = data.join(rolling_stats)
  8. # Plot stats
  9. stats.plot(subplots=True);
  10. plt.show()





第1四分位数 (Q1),又称“较小四分位数”,等于该样本中所有数值由小到大排列后第25%的数字。

第2四分位数 (Q2),又称“中位数”,等于该样本中所有数值由小到大排列后第50%的数字。

第3四分位数 (Q3),又称“较大四分位数”,等于该样本中所有数值由小到大排列后第75%的数字。

四分位距(InterQuartile Range, IQR)= 第3四分位数与第1四分位数的差距


  1. # Resample, interpolate and inspect ozone data here
  2. data = data.resample('D').interpolate()
  3. data.info()
  4. # Create the rolling window
  5. rolling = data.rolling(360)['Ozone']
  6. # Insert the rolling quantiles to the monthly returns
  7. data['q10'] = rolling.quantile(.1)
  8. data['q50'] = rolling.quantile(.5)
  9. data['q90'] = rolling.quantile(.9)
  10. # Plot the data
  11. data.plot()
  12. plt.show()



这个函数挺好使的 参考下官方文档 demo很好理解


pandas 中统计累计次数





  1. # Import numpy
  2. import numpy as np
  3. # Define a multi_period_return function
  4. def multi_period_return(period_returns):
  5. return np.prod(period_returns + 1) - 1
  6. # Calculate daily returns
  7. daily_returns = data.pct_change()
  8. # Calculate rolling_annual_returns
  9. rolling_annual_returns = daily_returns.rolling('360D').apply(multi_period_return)
  10. # Plot rolling_annual_returns
  11. rolling_annual_returns.mul(100).plot();
  12. plt.show()


  1. # Create multi_period_return function here
  2. def multi_period_return(r):
  3. return (np.prod(r + 1) - 1) * 100







random walk

  1. # Set seed here
  2. seed(42)
  3. # Create random_walk
  4. random_walk = normal(loc=.001, scale=0.01, size=2500)
  5. # Convert random_walk to pd.series
  6. random_walk = pd.Series(random_walk)
  7. # Create random_prices
  8. random_prices = random_walk.add(1).cumprod()
  9. # Plot random_prices here
  10. random_prices.mul(1000).plot()
  11. plt.show();




Relationships between time series: correlation




  1. # Inspect data here
  2. print(data.info())
  3. # Calculate year-end prices here
  4. annual_prices = data.resample('A').last()
  5. # Calculate annual returns here
  6. annual_returns = annual_prices.pct_change()
  7. # Calculate and print the correlation matrix here
  8. correlations = annual_returns.corr()
  9. print(correlations)
  10. # Visualize the correlations as heatmap here
  11. sns.heatmap(correlations, annot=True)
  12. plt.show();

Select index components & import data




  1. # Select largest company for each sector
  2. components = listings.groupby(['Sector'])['Market Capitalization'].nlargest(1)
  3. # Print components, sorted by market cap
  4. print(components.sort_values(ascending=False))
  5. # Select stock symbols and print the result
  6. tickers = components.index.get_level_values('Stock Symbol')
  7. print(tickers)
  8. # Print company name, market cap, and last price for each component
  9. info_cols = ['Company Name', 'Market Capitalization', 'Last Sale']
  10. print(listings.loc[tickers, info_cols].sort_values('Market Capitalization', ascending=False))
  11. <script.py> output:
  12. Sector Stock Symbol
  13. Technology AAPL 740,024.47
  14. Consumer Services AMZN 422,138.53
  15. Miscellaneous MA 123,330.09
  16. Health Care AMGN 118,927.21
  17. Transportation UPS 90,180.89
  18. Finance GS 88,840.59
  19. Basic Industries RIO 70,431.48
  20. Public Utilities TEF 54,609.81
  21. Consumer Non-Durables EL 31,122.51
  22. Capital Goods ILMN 25,409.38
  23. Energy PAA 22,223.00
  24. Consumer Durables CPRT 13,620.92
  25. Name: Market Capitalization, dtype: float64
  26. Index(['RIO', 'ILMN', 'CPRT', 'EL', 'AMZN', 'PAA', 'GS', 'AMGN', 'MA', 'TEF', 'AAPL', 'UPS'], dtype='object', name='Stock Symbol')
  27. Company Name Market Capitalization Last Sale
  28. Stock Symbol
  29. AAPL Apple Inc. 740,024.47 141.05
  30. AMZN Amazon.com, Inc. 422,138.53 884.67
  31. MA Mastercard Incorporated 123,330.09 111.22
  32. AMGN Amgen Inc. 118,927.21 161.61
  33. UPS United Parcel Service, Inc. 90,180.89 103.74
  34. GS Goldman Sachs Group, Inc. (The) 88,840.59 223.32
  35. RIO Rio Tinto Plc 70,431.48 38.94
  36. TEF Telefonica SA 54,609.81 10.84
  37. EL Estee Lauder Companies, Inc. (The) 31,122.51 84.94
  38. ILMN Illumina, Inc. 25,409.38 173.68
  39. PAA Plains All American Pipeline, L.P. 22,223.00 30.72
  40. CPRT Copart, Inc. 13,620.92 29.65







  1. # Select the number of shares
  2. no_shares = components['Number of Shares']
  3. print(no_shares.sort_values())
  4. # Create the series of market cap per ticker
  5. market_cap = stock_prices.mul(no_shares)
  6. # Select first and last market cap here
  7. first_value = market_cap.iloc[0]
  8. last_value = market_cap.iloc[-1]
  9. # Concatenate and plot first and last market cap here
  10. pd.concat([first_value, last_value], axis=1).plot(kind='barh')
  11. plt.show()



  1. # Export data and data as returns to excel
  2. with pd.ExcelWriter('data.xls') as writer:
  3. data.to_excel(writer, sheet_name='data')
  4. returns.to_excel(writer, sheet_name='returns')



  1. pandas包学习笔记

    目录 zip Importing & exporting data Plotting with pandas Visual exploratory data analysis 折线图 散点图 ...

  2. pandas库学习笔记(二)DataFrame入门学习

    Pandas基本介绍——DataFrame入门学习 前篇文章中,小生初步介绍pandas库中的Series结构的创建与运算,今天小生继续“死磕自己”为大家介绍pandas库的另一种最为常见的数据结构D ...

  3. 初步了解pandas(学习笔记)

    1 pandas简介 pandas 是一种列存数据分析 API.它是用于处理和分析输入数据的强大工具,很多机器学习框架都支持将 pandas 数据结构作为输入. 虽然全方位介绍 pandas API ...

  4. pandas库学习笔记(一)Series入门学习

    Pandas基本介绍: pandas is an open source, BSD-licensed (permissive free software licenses) library provi ...

  5. python的pandas库学习笔记

    导入: import pandas as pd from pandas import Series,DataFrame 1.两个主要数据结构:Series和DataFrame (1)Series是一种 ...

  6. Pandas DataFrame学习笔记

    对一个DF r1  r2  r3 c1 c2 c3 选行:  df['r1']  df['r2':'r2']  #包含r2  df[df['c1']>5] #按条件选 选列:  df['c1'] ...

  7. 数据分析之Pandas和Numpy学习笔记(持续更新)<1>

    pandas and numpy notebook        最近工作交接,整理电脑资料时看到了之前的基于Jupyter学习数据分析相关模块学习笔记.想着拿出来分享一下,可是Jupyter导出来h ...

  8. Pandas学习笔记

    本学习笔记来自于莫烦Python,原视频链接 一.Pandas基本介绍和使用 Series数据结构:索引在左,值在右 import pandas as pd import numpy as np s ...

  9. Pandas 学习笔记

    Pandas 学习笔记 pandas 由两部份组成,分别是 Series 和 DataFrame. Series 可以理解为"一维数组.列表.字典" DataFrame 可以理解为 ...


