1.import 模块

import os

import tarfile

from six.moves import urllib

import pandas as pd

pd.set_option('display.width', None)

import matplotlib.pyplot as plt

import numpy as np

import hashlib

2.获取数据模块

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"

HOUSING_PATH = "datasets/housing"

HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

print(HOUSING_URL)

https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):

    if not os.path.isdir(housing_path):

        os.makedirs(housing_path)

    tgz_path = os.path.join(housing_path, "housing.tgz")

    urllib.request.urlretrieve(housing_url, tgz_path)

    housing_tgz = tarfile.open(tgz_path)

    housing_tgz.extractall(path=housing_path)

    housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):  # 加载数据函数

    csv_path = os.path.join(housing_path, "housing.csv")

    print(csv_path)

    return pd.read_csv(csv_path)  # 返回一个pandas DataFrame对象

2.1查看数据

housing = load_housing_data()

print("---" * 20)

print("         查看pandas DataFrame对象的头部(前5行")

print("---" * 20)

print(housing.head())  # 查看pandas DataFrame对象的头部

print("---" * 20)

print("         查看pandas DataFrame的具体信息")

print("---" * 20)

print(housing.info())  # 查看pandas DataFrmae的具体信息

print("---" * 20)

print("         查看pandas DataFrame中ocean_proximity字段的分类信息")

print("---" * 20)

print(housing["ocean_proximity"].value_counts())

print("---" * 20)

print("         查看具体数值属性的摘要")

print("---" * 20)

print(housing.describe())

datasets/housing\housing.csv

------------------------------------------------------------

         查看pandas DataFrame对象的头部(前5行

------------------------------------------------------------

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \

0    -122.23     37.88                41.0        880.0           129.0

1    -122.22     37.86                21.0       7099.0          1106.0

2    -122.24     37.85                52.0       1467.0           190.0

3    -122.25     37.85                52.0       1274.0           235.0

4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity

0       322.0       126.0         8.3252            452600.0        NEAR BAY

1      2401.0      1138.0         8.3014            358500.0        NEAR BAY

2       496.0       177.0         7.2574            352100.0        NEAR BAY

3       558.0       219.0         5.6431            341300.0        NEAR BAY

4       565.0       259.0         3.8462            342200.0        NEAR BAY

------------------------------------------------------------

         查看pandas DataFrame的具体信息

------------------------------------------------------------

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 20640 entries, 0 to 20639

Data columns (total 10 columns):

longitude             20640 non-null float64

latitude              20640 non-null float64

housing_median_age    20640 non-null float64

total_rooms           20640 non-null float64

total_bedrooms        20433 non-null float64

population            20640 non-null float64

households            20640 non-null float64

median_income         20640 non-null float64

median_house_value    20640 non-null float64

ocean_proximity       20640 non-null object

dtypes: float64(9), object(1)

memory usage: 1.6+ MB

None

------------------------------------------------------------

         查看pandas DataFrame中ocean_proximity字段的分类信息

------------------------------------------------------------

<1H OCEAN     9136

INLAND        6551

NEAR OCEAN    2658

NEAR BAY      2290

ISLAND           5

Name: ocean_proximity, dtype: int64

------------------------------------------------------------

         查看具体数值属性的摘要

------------------------------------------------------------

          longitude      latitude  housing_median_age   total_rooms  \

count  20640.000000  20640.000000        20640.000000  20640.000000

mean    -119.569704     35.631861           28.639486   2635.763081

std        2.003532      2.135952           12.585558   2181.615252

min     -124.350000     32.540000            1.000000      2.000000

25%     -121.800000     33.930000           18.000000   1447.750000

50%     -118.490000     34.260000           29.000000   2127.000000

75%     -118.010000     37.710000           37.000000   3148.000000

max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \

count    20433.000000  20640.000000  20640.000000   20640.000000

mean       537.870553   1425.476744    499.539680       3.870671

std        421.385070   1132.462122    382.329753       1.899822

min          1.000000      3.000000      1.000000       0.499900

25%        296.000000    787.000000    280.000000       2.563400

50%        435.000000   1166.000000    409.000000       3.534800

75%        647.000000   1725.000000    605.000000       4.743250

max       6445.000000  35682.000000   6082.000000      15.000100   

       median_house_value

count        20640.000000

mean        206855.816909

std         115395.615874

min          14999.000000

25%         119600.000000

50%         179700.000000

75%         264725.000000

max         500001.000000

bins : integer or array_like, optional

这个参数指定bin(箱子)的个数,也就是总共有几条条状图

igsize The size in inches of the figure to create. Uses the value in matplotlib.rcParams by default. 这个参数指创建图形的大小

housing.hist(bins=50, figsize=(20, 20))  # 绘制housing数据的直方图

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000002393010CEF0>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x000002393040C780>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023930667A90>],

       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023930691DA0>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x00000239306C40F0>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x00000239306C4128>],

       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000239307156D8>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x000002393073E9E8>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023930769CF8>]],

      dtype=object)

3. 使用随机抽样创建测试集

def split_train_test(data, test_ratio):

    shuffled_indices = np.random.permutation(len(data))  # 随机洗牌

    test_set_size = int(len(data) * test_ratio)

    test_indices = shuffled_indices[:test_set_size]  # 取test_set

    train_indices = shuffled_indices[test_set_size:]  # 取train_set

    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)

print(len(train_set), "train +", len(test_set), "test")

16512 train + 4128 test

train_set和test_set分割后的问题：

按如上办法分割出20%的test_set，但是这个方法有一个问题，就是每一次运行程序时的test_set和train_set的数据集都不一样，这样数据分析的时候就会有问题。

解决方案：

Option1：在程序第一次运行时保存test_set和train_set，以后每次运行都加载它们。
Option2：在每次调用random.permutation生成随机序列的时候都生成一个随机索引，比如：np.random.seed(),每次执行方法查找索引找数据集。

  这两个方法同样有问题，就是每当数据更新时，程序要重新开始运行，所有的从头开始。所以有Option3。

Option3：对每一个实例都用一个标识符来决定是否让他加入数据集中。如:给每一个实例都加一个唯一的标识符，计算每个标识符的hash值，取hash的最后一个字节，该值小于51（256的20%）就让他加入测试集。注：使用这个方法要保证在加入新数据时各个数据的index不变，否则就没有意义了。

3.1 Option3 实现

def test_set_check(identifer, test_ratio, hash):

    return hash(np.int64(identifer)).digest()[-1] < 256 * test_ratio  # 检查最后一位

def split_train_test_by_id(data, test_radio, id_colum, hash=hashlib.md5):

    ids = data[id_colum]

    int_test_set = ids.apply(lambda id_: test_set_check(id_, test_radio, hash))

    return data.loc[~int_test_set], data.loc[

        int_test_set]  # loc：通过行标签索引数据 iloc：通过行号索引行数据 ix：通过行标签或行号索引数据（基于loc和iloc的混合）

housing_with_id = housing.reset_index()  # 给数据加colum->index

'''

train_set,test_set=split_train_test_by_id(housing_with_id,0.2,"index")  # 使用普通序列作为标识符

'''

housing_with_id["id"] = housing["longitude"] * 1000 +     housing["latitude"]  # 使用经纬度作为标识符（可以保证标识符不变）

train_set_by_id, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

在这个数据集中有部分房屋数据的经纬度是一样的，所以一部分的ID也是一样的位置信息实际上是相当粗粒度的，许多区域可能会拥有完全相同的ID，结果就是它们会被纳入同一个集合（测试集或者训练集）。而这有可能会导致一些抽样偏差。

print("---" * 20)

print("         加入标识符的数据：")

print("---" * 20)

print(housing_with_id)

------------------------------------------------------------

         加入标识符的数据：

------------------------------------------------------------

       index  longitude  latitude  housing_median_age  total_rooms  \

0          0    -122.23     37.88                41.0        880.0

1          1    -122.22     37.86                21.0       7099.0

2          2    -122.24     37.85                52.0       1467.0

3          3    -122.25     37.85                52.0       1274.0

4          4    -122.25     37.85                52.0       1627.0

5          5    -122.25     37.85                52.0        919.0

6          6    -122.25     37.84                52.0       2535.0

7          7    -122.25     37.84                52.0       3104.0

8          8    -122.26     37.84                42.0       2555.0

9          9    -122.25     37.84                52.0       3549.0

10        10    -122.26     37.85                52.0       2202.0

11        11    -122.26     37.85                52.0       3503.0

12        12    -122.26     37.85                52.0       2491.0

13        13    -122.26     37.84                52.0        696.0

14        14    -122.26     37.85                52.0       2643.0

15        15    -122.26     37.85                50.0       1120.0

16        16    -122.27     37.85                52.0       1966.0

17        17    -122.27     37.85                52.0       1228.0

18        18    -122.26     37.84                50.0       2239.0

19        19    -122.27     37.84                52.0       1503.0

20        20    -122.27     37.85                40.0        751.0

21        21    -122.27     37.85                42.0       1639.0

22        22    -122.27     37.84                52.0       2436.0

23        23    -122.27     37.84                52.0       1688.0

24        24    -122.27     37.84                52.0       2224.0

25        25    -122.28     37.85                41.0        535.0

26        26    -122.28     37.85                49.0       1130.0

27        27    -122.28     37.85                52.0       1898.0

28        28    -122.28     37.84                50.0       2082.0

29        29    -122.28     37.84                52.0        729.0

...      ...        ...       ...                 ...          ...

20610  20610    -121.56     39.10                28.0       2130.0

20611  20611    -121.55     39.10                27.0       1783.0

20612  20612    -121.56     39.08                26.0       1377.0

20613  20613    -121.55     39.09                31.0       1728.0

20614  20614    -121.54     39.08                26.0       2276.0

20615  20615    -121.54     39.08                23.0       1076.0

20616  20616    -121.53     39.08                15.0       1810.0

20617  20617    -121.53     39.06                20.0        561.0

20618  20618    -121.55     39.06                25.0       1332.0

20619  20619    -121.56     39.01                22.0       1891.0

20620  20620    -121.48     39.05                40.0        198.0

20621  20621    -121.47     39.01                37.0       1244.0

20622  20622    -121.44     39.00                20.0        755.0

20623  20623    -121.37     39.03                32.0       1158.0

20624  20624    -121.41     39.04                16.0       1698.0

20625  20625    -121.52     39.12                37.0        102.0

20626  20626    -121.43     39.18                36.0       1124.0

20627  20627    -121.32     39.13                 5.0        358.0

20628  20628    -121.48     39.10                19.0       2043.0

20629  20629    -121.39     39.12                28.0      10035.0

20630  20630    -121.32     39.29                11.0       2640.0

20631  20631    -121.40     39.33                15.0       2655.0

20632  20632    -121.45     39.26                15.0       2319.0

20633  20633    -121.53     39.19                27.0       2080.0

20634  20634    -121.56     39.27                28.0       2332.0

20635  20635    -121.09     39.48                25.0       1665.0

20636  20636    -121.21     39.49                18.0        697.0

20637  20637    -121.22     39.43                17.0       2254.0

20638  20638    -121.32     39.43                18.0       1860.0

20639  20639    -121.24     39.37                16.0       2785.0   

       total_bedrooms  population  households  median_income  \

0               129.0       322.0       126.0         8.3252

1              1106.0      2401.0      1138.0         8.3014

2               190.0       496.0       177.0         7.2574

3               235.0       558.0       219.0         5.6431

4               280.0       565.0       259.0         3.8462

5               213.0       413.0       193.0         4.0368

6               489.0      1094.0       514.0         3.6591

7               687.0      1157.0       647.0         3.1200

8               665.0      1206.0       595.0         2.0804

9               707.0      1551.0       714.0         3.6912

10              434.0       910.0       402.0         3.2031

11              752.0      1504.0       734.0         3.2705

12              474.0      1098.0       468.0         3.0750

13              191.0       345.0       174.0         2.6736

14              626.0      1212.0       620.0         1.9167

15              283.0       697.0       264.0         2.1250

16              347.0       793.0       331.0         2.7750

17              293.0       648.0       303.0         2.1202

18              455.0       990.0       419.0         1.9911

19              298.0       690.0       275.0         2.6033

20              184.0       409.0       166.0         1.3578

21              367.0       929.0       366.0         1.7135

22              541.0      1015.0       478.0         1.7250

23              337.0       853.0       325.0         2.1806

24              437.0      1006.0       422.0         2.6000

25              123.0       317.0       119.0         2.4038

26              244.0       607.0       239.0         2.4597

27              421.0      1102.0       397.0         1.8080

28              492.0      1131.0       473.0         1.6424

29              160.0       395.0       155.0         1.6875

...               ...         ...         ...            ...

20610           484.0      1195.0       439.0         1.3631

20611           441.0      1163.0       409.0         1.2857

20612           289.0       761.0       267.0         1.4934

20613           365.0      1167.0       384.0         1.4958

20614           460.0      1455.0       474.0         2.4695

20615           216.0       724.0       197.0         2.3598

20616           441.0      1157.0       375.0         2.0469

20617           109.0       308.0       114.0         3.3021

20618           247.0       726.0       226.0         2.2500

20619           340.0      1023.0       296.0         2.7303

20620            41.0       151.0        48.0         4.5625

20621           247.0       484.0       157.0         2.3661

20622           147.0       457.0       157.0         2.4167

20623           244.0       598.0       227.0         2.8235

20624           300.0       731.0       291.0         3.0739

20625            17.0        29.0        14.0         4.1250

20626           184.0       504.0       171.0         2.1667

20627            65.0       169.0        59.0         3.0000

20628           421.0      1018.0       390.0         2.5952

20629          1856.0      6912.0      1818.0         2.0943

20630           505.0      1257.0       445.0         3.5673

20631           493.0      1200.0       432.0         3.5179

20632           416.0      1047.0       385.0         3.1250

20633           412.0      1082.0       382.0         2.5495

20634           395.0      1041.0       344.0         3.7125

20635           374.0       845.0       330.0         1.5603

20636           150.0       356.0       114.0         2.5568

20637           485.0      1007.0       433.0         1.7000

20638           409.0       741.0       349.0         1.8672

20639           616.0      1387.0       530.0         2.3886   

       median_house_value ocean_proximity         id

0                452600.0        NEAR BAY -122192.12

1                358500.0        NEAR BAY -122182.14

2                352100.0        NEAR BAY -122202.15

3                341300.0        NEAR BAY -122212.15

4                342200.0        NEAR BAY -122212.15

5                269700.0        NEAR BAY -122212.15

6                299200.0        NEAR BAY -122212.16

7                241400.0        NEAR BAY -122212.16

8                226700.0        NEAR BAY -122222.16

9                261100.0        NEAR BAY -122212.16

10               281500.0        NEAR BAY -122222.15

11               241800.0        NEAR BAY -122222.15

12               213500.0        NEAR BAY -122222.15

13               191300.0        NEAR BAY -122222.16

14               159200.0        NEAR BAY -122222.15

15               140000.0        NEAR BAY -122222.15

16               152500.0        NEAR BAY -122232.15

17               155500.0        NEAR BAY -122232.15

18               158700.0        NEAR BAY -122222.16

19               162900.0        NEAR BAY -122232.16

20               147500.0        NEAR BAY -122232.15

21               159800.0        NEAR BAY -122232.15

22               113900.0        NEAR BAY -122232.16

23                99700.0        NEAR BAY -122232.16

24               132600.0        NEAR BAY -122232.16

25               107500.0        NEAR BAY -122242.15

26                93800.0        NEAR BAY -122242.15

27               105500.0        NEAR BAY -122242.15

28               108900.0        NEAR BAY -122242.16

29               132000.0        NEAR BAY -122242.16

...                   ...             ...        ...

20610             45500.0          INLAND -121520.90

20611             47000.0          INLAND -121510.90

20612             48300.0          INLAND -121520.92

20613             53400.0          INLAND -121510.91

20614             58000.0          INLAND -121500.92

20615             57500.0          INLAND -121500.92

20616             55100.0          INLAND -121490.92

20617             70800.0          INLAND -121490.94

20618             63400.0          INLAND -121510.94

20619             99100.0          INLAND -121520.99

20620            100000.0          INLAND -121440.95

20621             77500.0          INLAND -121430.99

20622             67000.0          INLAND -121401.00

20623             65500.0          INLAND -121330.97

20624             87200.0          INLAND -121370.96

20625             72000.0          INLAND -121480.88

20626             93800.0          INLAND -121390.82

20627            162500.0          INLAND -121280.87

20628             92400.0          INLAND -121440.90

20629            108300.0          INLAND -121350.88

20630            112000.0          INLAND -121280.71

20631            107200.0          INLAND -121360.67

20632            115600.0          INLAND -121410.74

20633             98300.0          INLAND -121490.81

20634            116800.0          INLAND -121520.73

20635             78100.0          INLAND -121050.52

20636             77100.0          INLAND -121170.51

20637             92300.0          INLAND -121180.57

20638             84700.0          INLAND -121280.57

20639             89400.0          INLAND -121200.63  

[20640 rows x 12 columns]

3.2 使用sk-learn生成随机数据集

一个函数包含从split_train_test到这一行以上的所有内容。。。。

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# random就是Option2 生成随机索引

print(test_set)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \

20046    -119.01     36.06                25.0       1505.0             NaN

3024     -119.46     35.14                30.0       2943.0             NaN

15663    -122.44     37.80                52.0       3830.0             NaN

20484    -118.72     34.28                17.0       3051.0             NaN

9814     -121.93     36.62                34.0       2351.0             NaN

13311    -117.61     34.08                12.0       4427.0             NaN

7113     -118.02     33.89                36.0       1375.0             NaN

7668     -118.08     33.92                38.0       1335.0             NaN

18246    -122.08     37.39                 4.0       2292.0             NaN

5723     -118.23     34.18                45.0       2332.0             NaN

20069    -120.37     38.01                30.0        473.0             NaN

6835     -118.12     34.08                35.0       2248.0             NaN

11351    -117.91     33.76                20.0       4413.0             NaN

20267    -119.19     34.20                18.0       3620.0             NaN

7097     -117.98     33.92                27.0       3700.0             NaN

6298     -117.91     34.02                22.0       6269.0             NaN

696      -122.10     37.69                41.0        746.0             NaN

19607    -121.02     37.48                26.0        467.0             NaN

14173    -117.05     32.75                36.0       2024.0             NaN

19638    -120.97     37.43                27.0       1380.0             NaN

18332    -122.16     37.45                47.0       4234.0             NaN

4691     -118.37     34.07                50.0       2519.0             NaN

2323     -119.73     36.83                 8.0       3602.0             NaN

16880    -122.39     37.59                32.0       4497.0             NaN

14521    -117.14     32.90                16.0       3217.0             NaN

19833    -119.38     36.53                38.0       1281.0             NaN

8383     -118.36     33.96                26.0       3543.0             NaN

2647     -124.13     40.55                38.0        544.0             NaN

1456     -121.98     37.96                22.0       2987.0             NaN

5678     -118.30     33.72                35.0       2790.0             NaN

...          ...       ...                 ...          ...             ...

146      -122.22     37.81                52.0       1971.0           335.0

4989     -118.31     34.00                47.0       1551.0           362.0

16215    -121.34     37.96                27.0       1839.0           442.0

241      -122.21     37.78                52.0       1477.0           300.0

18746    -122.34     40.51                16.0       2247.0           502.0

3595     -118.49     34.24                35.0       2707.0           446.0

8404     -118.36     33.93                30.0       1132.0           347.0

10360    -117.67     33.57                18.0       1614.0           210.0

5714     -118.23     34.21                50.0        309.0            47.0

12401    -116.31     33.65                 8.0       3079.0           558.0

1952     -120.81     38.73                38.0       2005.0           385.0

15800    -122.44     37.76                52.0       2959.0           683.0

1594     -122.12     37.91                34.0       5683.0           755.0

9161     -118.49     34.42                23.0       4166.0           756.0

8478     -118.31     33.93                35.0       1580.0           266.0

10087    -117.97     33.94                36.0       1870.0           338.0

2680     -115.51     32.99                20.0       1402.0           287.0

844      -122.07     37.58                16.0       1644.0           251.0

4779     -118.32     34.04                47.0       1989.0           532.0

11919    -117.42     33.95                32.0       4251.0           848.0

4653     -118.33     34.06                52.0       1368.0           231.0

1783     -122.36     37.94                26.0       1540.0           343.0

7608     -118.26     33.88                40.0        519.0           102.0

0        -122.23     37.88                41.0        880.0           129.0

10155    -117.89     33.89                17.0       1671.0           192.0

15362    -117.22     33.36                16.0       3165.0           482.0

16623    -120.83     35.36                28.0       4323.0           886.0

18086    -122.05     37.31                25.0       4111.0           538.0

2144     -119.76     36.77                36.0       2507.0           466.0

3665     -118.37     34.22                17.0       1787.0           463.0   

       population  households  median_income  median_house_value  \

20046      1392.0       359.0         1.6812             47700.0

3024       1565.0       584.0         2.5313             45800.0

15663      1310.0       963.0         3.4801            500001.0

20484      1705.0       495.0         5.7376            218600.0

9814       1063.0       428.0         3.7250            278000.0

13311      2400.0       843.0         4.7147            158700.0

7113        670.0       221.0         5.0839            198200.0

7668       1011.0       269.0         3.6908            157500.0

18246      1050.0       584.0         4.8036            340000.0

5723        943.0       339.0         8.1132            446600.0

20069       242.0        93.0         2.5417            123200.0

6835       1762.0       622.0         3.0000            253900.0

11351      4818.0      1063.0         2.8594            215100.0

20267      3171.0       779.0         3.3409            220500.0

7097       1793.0       552.0         5.3668            219800.0

6298       5587.0      1251.0         3.8201            136200.0

696         387.0       161.0         3.9063            178400.0

19607       244.0        83.0         4.1346            187500.0

14173      1030.0       390.0         3.8233            139800.0

19638       810.0       262.0         2.1875            137500.0

18332      1808.0      1093.0         4.2297            425000.0

4691       1117.0       516.0         4.3667            405600.0

2323       1959.0       580.0         5.3478            138800.0

16880      1846.0       715.0         6.1323            500001.0

14521      2054.0       687.0         4.2234            162100.0

19833      1423.0       293.0         1.9602             51400.0

8383       2742.0       951.0         2.5504            151300.0

2647        240.0        91.0         3.2500             94800.0

1456       1420.0       540.0         3.6500            204100.0

5678       1167.0       441.0         6.2028            361500.0

...           ...         ...            ...                 ...

146         765.0       308.0         6.5217            273700.0

4989       1329.0       322.0         1.9792            116400.0

16215      2010.0       416.0         2.1284             59400.0

241        1065.0       269.0         1.8472            137000.0

18746      1206.0       463.0         1.9946            119200.0

3595       1224.0       445.0         5.2939            244200.0

8404       1433.0       341.0         2.6800            170000.0

10360       692.0       209.0         7.9294            280300.0

5714        121.0        45.0         6.2130            285000.0

12401      1572.0       474.0         4.5938            102600.0

1952        882.0       353.0         2.5104            120500.0

15800      1145.0       666.0         4.2222            361600.0

1594       1962.0       723.0         8.3678            455300.0

9161       2082.0       743.0         4.4107            213400.0

8478        926.0       282.0         5.0653            158000.0

10087       947.0       324.0         4.1205            217000.0

2680       1104.0       317.0         1.9088             63700.0

844        1033.0       267.0         6.5116            244300.0

4779       1430.0       519.0         1.8333            151100.0

11919      2494.0       798.0         2.8173            110800.0

4653        737.0       248.0         8.3617            433800.0

1783       1007.0       338.0         1.3365             72900.0

7608        330.0        95.0         3.0972            108500.0

0           322.0       126.0         8.3252            452600.0

10155       678.0       206.0        13.1107            467600.0

15362      1351.0       452.0         4.6050            263300.0

16623      1650.0       705.0         2.7266            266800.0

18086      1585.0       568.0         9.2298            500001.0

2144       1227.0       474.0         2.7850             72300.0

3665       1671.0       448.0         3.5521            151500.0   

      ocean_proximity

20046          INLAND

3024           INLAND

15663        NEAR BAY

20484       <1H OCEAN

9814       NEAR OCEAN

13311          INLAND

7113        <1H OCEAN

7668        <1H OCEAN

18246        NEAR BAY

5723        <1H OCEAN

20069          INLAND

6835        <1H OCEAN

11351       <1H OCEAN

20267      NEAR OCEAN

7097        <1H OCEAN

6298        <1H OCEAN

696          NEAR BAY

19607          INLAND

14173      NEAR OCEAN

19638          INLAND

18332        NEAR BAY

4691        <1H OCEAN

2323           INLAND

16880      NEAR OCEAN

14521       <1H OCEAN

19833          INLAND

8383        <1H OCEAN

2647        <1H OCEAN

1456           INLAND

5678       NEAR OCEAN

...               ...

146          NEAR BAY

4989        <1H OCEAN

16215          INLAND

241          NEAR BAY

18746          INLAND

3595        <1H OCEAN

8404        <1H OCEAN

10360       <1H OCEAN

5714        <1H OCEAN

12401          INLAND

1952           INLAND

15800        NEAR BAY

1594         NEAR BAY

9161        <1H OCEAN

8478        <1H OCEAN

10087       <1H OCEAN

2680           INLAND

844          NEAR BAY

4779        <1H OCEAN

11919          INLAND

4653        <1H OCEAN

1783         NEAR BAY

7608        <1H OCEAN

0            NEAR BAY

10155       <1H OCEAN

15362       <1H OCEAN

16623      NEAR OCEAN

18086       <1H OCEAN

2144           INLAND

3665        <1H OCEAN  

[4128 rows x 10 columns]

在数据量很大的情况下使用随机抽样通常都能有效建立test_set和train_set 但是，在数据量不大的情况下随机抽样就会有问题。比如要调查某件事跟性别有关，当地男女比例就是一个很重要的信息，然而，如果我们只有100的志愿者可以调查，随机抽样20%的时候，抽到1个女的，19个男的，这个test的结果就会有很大的倾斜。所以，在数据量小的情况下使用分层抽样保证相应的比例，训练效果会更好。

4.数据分析

房价关联收入 --> 对收入分层抽样，按收入类别预测房价

# Return the ceiling of the input, element-wise.

housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)

housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(

    n_splits=1, test_size=0.2, random_state=42)  # 分层采样

# 尝试更多的各个属性的组合 发现更多的数据关联

housing["room_per_household"] = housing["total_rooms"] / housing["households"]

housing["bedrooms_per_house"] = housing["total_bedrooms"] /     housing["total_rooms"]

housing["population_per_household"] = housing["population"] /     housing["households"]

corr_matrix = housing.corr()

print(corr_matrix["median_house_value"].sort_values(ascending=False))

median_house_value          1.000000

median_income               0.688075

income_cat                  0.643892

room_per_household          0.151948

total_rooms                 0.134153

housing_median_age          0.105623

households                  0.065843

total_bedrooms              0.049686

population_per_household   -0.023737

population                 -0.024650

longitude                  -0.045967

latitude                   -0.144160

bedrooms_per_house         -0.255880

Name: median_house_value, dtype: float64

分层采样实例：

假设housing["income_cat"]=[1,1,1,1,1,2,2,22,2,2,2,2,2,2]也就是4个1，10个2，1所占的比例为1/3，2占的比例为2/3；则经过split.split(housing, housing["income_cat"])后strat_train_set [“income_cat”]和strat_test_set[“income_cat”]中1和2，所占的比例相同，分别为1/2，和2/3.也就是说，strat_train_set [“income_cat”]中有4个1，8个2；而strat_test_set[“income_cat”]中有1个1，2个2

for train_index, test_index in split.split(housing, housing["income_cat"]):

    strat_train_set = housing.loc[train_index]

    strat_test_set = housing.loc[test_index]

print(housing["income_cat"].value_counts() / len(housing))

3.0    0.350581

2.0    0.318847

4.0    0.176308

5.0    0.114438

1.0    0.039826

Name: income_cat, dtype: float64

丢弃income_cat属性

沿着行的方向扫描数据（axis=1），丢弃所有行里面income_cat下的数据（也就删除了一列）

for set in (strat_test_set, strat_train_set):

    set.drop(["income_cat"], axis=1, inplace=True)

4.1数据可视化分析

housing = strat_train_set.copy()  # 创建一个用于探索的探索集

# housing.plot(kind = "scatter",x="longitude",y="latitude"); # 绘制经纬度的散点图

housing.plot(kind="scatter", x="longitude",

             y="latitude", alpha=0.1)  # 突出散点图的高密度区域

<matplotlib.axes._subplots.AxesSubplot at 0x23933a6f208>

用人口和收入标识的散点图：

每个⚪的大小代表了人口的数量（参数s），颜色代表价格（参数c）
s表示点点的大小，c就是color jet是一种颜色体系

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,

             s=housing["population"] / 100, label="population",

             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)

<matplotlib.axes._subplots.AxesSubplot at 0x239301d4da0>

使用corr计算所有参数的标准相关系数（皮尔逊相关系数）：

相关系数仅测量线性相关性（“如果x上升，则y上升/下降”），非线性的相关性无法被检测出
相关系数矩阵中，要重点注意的是正相关与负相关数值大的，靠近的0的可以不考虑

corr_matrix = housing.corr()

print(corr_matrix["median_house_value"].sort_values(ascending=False))

median_house_value          1.000000

median_income               0.687160

room_per_household          0.146285

total_rooms                 0.135097

housing_median_age          0.114110

households                  0.064506

total_bedrooms              0.047689

population_per_household   -0.021985

population                 -0.026920

longitude                  -0.047432

latitude                   -0.142724

bedrooms_per_house         -0.259984

Name: median_house_value, dtype: float64

4.11 使用pandas的scatter_matrix绘制相关性

from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income",

              "total_rooms", "housing_median_age"]

# 根据4个属性，绘制一个4X4的相关性散点图。大小大小为12，8

scatter_matrix(housing[attributes], figsize=(12, 8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000239302A94A8>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023933D43A58>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023933D77128>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x00000239302EE780>],

       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023930C47E10>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023930C47E48>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023930CA1B70>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023930CD2240>],

       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023930CFA8D0>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023933EE4F60>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023933F14630>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023933F3ECC0>],

       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023933F6D390>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023933F95A20>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x00000239352F80F0>,

        <matplotlib.axes._subplots.AxesSubplot object at 0x000002393531F780>]],

      dtype=object)

分析：

对角线部分：核密度估计图（Kernel Density Estimation），就是用来看某一个变量分布情况，横轴对应着该变量的值，纵轴对应着该变量的密度（可以理解为出现频次）。
非对角线部分：两个变量之间分布的关联散点图。将任意两个变量进行配对，以其中一个为横坐标，另一个为纵坐标，将所有的数据点绘制在图上，用来衡量两个变量的关联度（Correlation）。

# 放大关联性最大的图

housing.plot(kind="scatter", x="median_income",

             y="median_house_value", alpha=0.4)

plt.show()

5.为算法准备数据

5.1 数据清理

5.11取出标签值“median_house_vlaue” -->因为这个是预测值

housing = housing.drop("median_house_value", axis=1)

housing_label = strat_train_set["median_house_value"].copy()

5.12 数据清洗

数据清洗一般有两种策略，一种是直接丢弃不用的数据，第二个是将不好的数据置零/平均数/中位数

下面有3中选择：

选择 1 ->丢弃相应的数据
选择 2 ->丢弃这个属性
选择 3 ->用特殊的数据填补

housing.dropna(subset=["total_bedrooms"])  # subset指定某列进行dropna（数据清洗）

housing.drop("total_bedrooms", axis=1)

median = housing["total_bedrooms"].median()

housing["total_bedrooms"].fillna(median)

17606     351.0

18632     108.0

14650     471.0

3230      371.0

3555     1525.0

19480     588.0

8879      317.0

13685     293.0

4937      465.0

4861      229.0

16365     951.0

19684     559.0

19234     501.0

13956     582.0

2390      495.0

11176     649.0

15614     545.0

2953      251.0

13209     409.0

6569      261.0

5825      913.0

18086     538.0

16718     945.0

13600     278.0

13989     444.0

15168     190.0

6747      563.0

7398      366.0

5562      133.0

16121     416.0

          ...

12380     767.0

5618       24.0

10060     539.0

18067     438.0

4471      797.0

19786     300.0

9969      393.0

14621    1051.0

579       302.0

11682    1615.0

245       460.0

12130     537.0

16441     544.0

11016     428.0

19934     422.0

1364       34.0

1236      829.0

5364      272.0

11703     300.0

10356     449.0

15270     515.0

3754      373.0

12166     756.0

6003      932.0

7364      212.0

6563      236.0

12053     294.0

13908     872.0

11159     380.0

15775     682.0

Name: total_bedrooms, Length: 16512, dtype: float64

5.13使用sk-learn对缺失值处理

from sklearn.impute import SimpleImputer

# 创建Imputer实例，使用median处理缺失数据-->创建一种策略

imputer = SimpleImputer(strategy="median")

# imputer只能处理数值属性，所以先删除Ocean属性

housing_num = housing.drop("ocean_proximity", axis=1)

# fit将inputer适配到数据集,这里会计算所有保留属性的中位数，保存在变量statistics中

imputer.fit(housing_num)

print(imputer.statistics_)

print(housing_num.median().values)

# 替换缺失值，返回一个Pandas DataFrame数组。

X = imputer.transform(housing_num)

# 将DataFrame存到housing_tr中

housing_tr = pd.DataFrame(X, columns=housing_num.columns)

[-1.18510000e+02  3.42600000e+01  2.90000000e+01  2.11950000e+03

  4.33000000e+02  1.16400000e+03  4.08000000e+02  3.54090000e+00

  5.23228423e+00  2.03031374e-01  2.81765270e+00]

[-1.18510000e+02  3.42600000e+01  2.90000000e+01  2.11950000e+03

  4.33000000e+02  1.16400000e+03  4.08000000e+02  3.54090000e+00

  5.23228423e+00  2.03031374e-01  2.81765270e+00]

5.2 处理文本和分类属性

5.21 使用转换器将ocean属性的文本属性转化为数值属性

from sklearn.preprocessing import LabelEncoder

encode = LabelEncoder()  # 创建转换器对象

housing_cat = housing["ocean_proximity"]

housing_cat_encoded = encode.fit_transform(housing_cat)  # 使用转换器转换

# print(housing_cat_encoded)

# print(encode.classes_)  # 其实转换器的使用和清洗器imputer是一样的，sklearn内部api具有很强的一致性

使用数据转换之后会发现，原本ocean里面的文本数据只是用来标记类别的，转换器转换之后，类别之间有了大小的关系而这样的大小关系是没有需要的，所以单纯用1234等来标记类别通常是不合理的。分类器做分类时，往往会认为这样的数据是连续并且有序的，用012345来表示各个类别之间关系与原来的关系不一样了。所以这里可以使用独热编码

#   使用独热编码

from sklearn.preprocessing import OneHotEncoder

encode = OneHotEncoder(categories='auto')

# housing_cat_1hout是一个稀疏矩阵

housing_cat_1hot = encode.fit_transform(housing_cat_encoded.reshape(-1, 1))

print(housing_cat_1hot.toarray())             # 稀疏矩阵转化为Numpy数组

[[1. 0. 0. 0. 0.]

 [1. 0. 0. 0. 0.]

 [0. 0. 0. 0. 1.]

 ...

 [0. 1. 0. 0. 0.]

 [1. 0. 0. 0. 0.]

 [0. 0. 0. 1. 0.]]

#  sklearn里面其他的转换方法LabelBinarizer(一次性从文本变成数字及oneHot编码)

from sklearn.preprocessing import LabelBinarizer

encode = LabelBinarizer()  # 如果加参数sparse_output=True下面返回的就是稀疏矩阵

housing_cat_1hot = encode.fit_transform(

    housing_cat)                       # 直接是Numpy类型

print(housing_cat_1hot)

[[1 0 0 0 0]

 [1 0 0 0 0]

 [0 0 0 0 1]

 ...

 [0 1 0 0 0]

 [1 0 0 0 0]

 [0 0 0 1 0]]

5.22 自定义转换器

对下面代码的理解：

首先，按照数据的范围设置界限分别来取room bedroom population以及household的值

然后，attr_adder初始化，调用类初始化，设置要不要每个房子的卧室数目。调用transform函数

对数据进行处理，然后使用np.c_函数进行合并 np.c_是列合并,要求列数一致, np.r_是行合并。
在本例中，转换器有一个超参数add_bedrooms_per_room默认设

置为True（提供合理的默认值通常是很有帮助的）。这个超参数可以

让你轻松知晓添加这个属性是否有助于机器学习的算法。更广泛地

说，如果你对数据准备的步骤没有充分的信心，就可以添加这个超参

数来进行把关。这些数据准备步骤的执行越自动化，你自动尝试的

组合也就越多，从而有更大可能从中找到一个重要的组合（还节省了大

量时间）。

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombineAttributesAdder(BaseEstimator, TransformerMixin):

    def __init__(self, add_bedrooms_per_room=True):

        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):

        rooms_per_household = X[:, rooms_ix]/X[:, household_ix]

        population_per_household = X[:, population_ix]/X[:, household_ix]

        if self.add_bedrooms_per_room:

            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]

            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

        else:

            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombineAttributesAdder(add_bedrooms_per_room=False)

housing_extra_arrtibs = attr_adder.transform(housing.values)

print(housing_extra_arrtibs)

[[-121.89 37.29 38.0 ... 2.094395280235988 4.625368731563422

  2.094395280235988]

 [-121.93 37.05 14.0 ... 2.7079646017699117 6.008849557522124

  2.7079646017699117]

 [-117.2 32.77 31.0 ... 2.0259740259740258 4.225108225108225

  2.0259740259740258]

 ...

 [-116.4 34.09 9.0 ... 2.742483660130719 6.34640522875817

  2.742483660130719]

 [-118.01 33.82 31.0 ... 3.808988764044944 5.50561797752809

  3.808988764044944]

 [-122.45 37.77 52.0 ... 1.9859154929577465 4.843505477308295

  1.9859154929577465]]

5.3 特征缩放

分析：

特征缩放常用于输入的数值属性有很大的比例差异的时候，因为这种差异往往会导致算法性能下降。

当然也有特例，具体问题具体分析。在这个例子里面，地区房间的总数在（6，39320）之间，

而收入的中位数在（0-15）之间，所以需要特征缩放。
特征缩放常用的两种方法：（两种估算器）
1. 最小最大缩放
  
  最大最小缩放又叫归一化，目的是将值的范围缩小到0-1之间。（不一定要是0-1）
  
  实现方法是将值减去min并除以(max-min)。
  
  在sklearn中的实现是MinMaxScaler方法，可以使用feature_range修改（0-1）的范围。
2. 标准化
  
  标准化是用当前值减去平均值再除以反差,让结果分布具备单位方差。
  
  标准化没有固定的范围，其特点是受异常值的影响更小。
  
  在sklearn中的实现是StandadScaler方法。

5.4 使用流水线转换数据

5.41流水线示例：

缺失值处理-> 合并新属性->标准化（结构：转换器-转换器-估算器，除最后一个外必须是转换器）
调用方法：当流水线调用fit_transform方法时，对于前两个转换器，会先调用fit（）方法，直到到转换器，到转换器调用transform方法。所以对于这种流水线直接调用fit_transform即可。

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

# Scikit-Learn中没有可以用来处理Pandas DataFrames的,因此我们需要为此任务编写一个简单的自定义转换器：

class DataFrameSelector(BaseEstimator, TransformerMixin):

    def __init__(self, attribute_names):

        self.attribute_names = attribute_names

    def fit(self, X, y=None):

        return self

    def transform(self, X):

        return X[self.attribute_names].values

num_pipline = Pipeline([

    ('Simpleimputer', SimpleImputer(strategy="median")),

    ('attribs_adder', CombineAttributesAdder()),

    ('std_scaler', StandardScaler()),

])

housing_num_str = num_pipline.fit_transform(housing_num)

print("-"*30)

print(housing_num_str)

------------------------------

[[-1.15604281  0.77194962  0.74333089 ... -0.31205452 -0.08649871

   0.15531753]

 [-1.17602483  0.6596948  -1.1653172  ...  0.21768338 -0.03353391

  -0.83628902]

 [ 1.18684903 -1.34218285  0.18664186 ... -0.46531516 -0.09240499

   0.4222004 ]

 ...

 [ 1.58648943 -0.72478134 -1.56295222 ...  0.3469342  -0.03055414

  -0.52177644]

 [ 0.78221312 -0.85106801  0.18664186 ...  0.02499488  0.06150916

  -0.30340741]

 [-1.43579109  0.99645926  1.85670895 ... -0.22852947 -0.09586294

   0.10180567]]

5.42完整的流水线处理

完整的流水线处理包含了缺失值处理-> 合并新属性->标准化->字符数据独热编码
下面的selector是数据选择
sklearn 的FeatureUnion类最终可以合并所有流水线的所产生的数据

from sklearn.pipeline import FeatureUnion

from sklearn_features.transformers import DataFrameSelector

num_attribs = list(housing_num)

cat_attribs = ["ocean_proximity"]

# 第一条流水线

num_pipeline = Pipeline([

    ('selector', DataFrameSelector(num_attribs)),

    ('Simpleimputer', SimpleImputer(strategy="median")),

    ('attribs_adder', CombineAttributesAdder()),

    ('std_scaler', StandardScaler()),

])

# 第二条流水线

# cat_pipeline = Pipeline([

#     ('selector',DataFrameSelector(cat_attribs)),

#     ('label_binarizer',LabelBinarizer()),

# ])

# * 这里书上的代码旧了，sklearn0.19重写了fit_tansforms,新的trans_form只接收两个参数，

# 流水线执行LabelBinarizer会传入3个参数，重写一个LabelBinarizer方法

from sklearn.base import TransformerMixin

class MyLabelBinarizer(TransformerMixin):

    def __init__(self, *args, **kwargs):

        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):

        self.encoder.fit(x)

        return self

    def transform(self, x, y=0):

        return self.encoder.transform(x)

cat_pipeline = Pipeline([

    ('selector', DataFrameSelector(cat_attribs)),

    ('label_binarizer', MyLabelBinarizer()),

])

# 合并流水线

full_pipeline = FeatureUnion(transformer_list=[

    ("num_pipline", num_pipeline),

    ("cat_pipline", cat_pipeline),

])

# 运行流水线

housing_prepared = full_pipeline.fit_transform(housing)

print(housing_prepared)

print(housing_prepared.shape)

[[-1.15604281  0.77194962  0.74333089 ...  0.          0.

   0.        ]

 [-1.17602483  0.6596948  -1.1653172  ...  0.          0.

   0.        ]

 [ 1.18684903 -1.34218285  0.18664186 ...  0.          0.

   1.        ]

 ...

 [ 1.58648943 -0.72478134 -1.56295222 ...  0.          0.

   0.        ]

 [ 0.78221312 -0.85106801  0.18664186 ...  0.          0.

   0.        ]

 [-1.43579109  0.99645926  1.85670895 ...  0.          1.

   0.        ]]

(16512, 19)

6 模型选择和训练

6.1 训练训练集

线性回归模型：LR

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(housing_prepared, housing_label)

# 取前5行

some_data = housing.iloc[:5]

some_labels = housing_label.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:\t", lin_reg.predict(some_data_prepared))

print("Labels:\t\t", list(some_labels))

print(lin_reg.intercept_)

print(lin_reg.coef_)

Predictions:	 [209420.50610494 315409.32621299 210124.77314125  55983.75406116

 183462.63421725]

Labels:		 [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

235473.80836449962

[-56129.06165758 -56723.67757798  13971.77259524   7327.89108513

   2200.80894803 -45937.59348295  41468.93537123  78337.91915705

   3575.6306461   19109.54513283    447.54395435   3575.6306461

    447.54395435  -2825.3656443  -17506.92104904 -51684.61814988

 105578.16342067 -22242.22267579 -14144.40154595]

# 对线性回归模型执行交叉验证

from sklearn.model_selection import cross_val_score

lin_score = cross_val_score(lin_reg, housing_prepared,

                            housing_label, scoring="neg_mean_squared_error", cv=10)

lin_rmse_score = np.sqrt(-lin_score)

print(lin_rmse_score)

print("Mean:", lin_rmse_score.mean())

print("Standard deviation:", lin_rmse_score.std())

[66062.46546015 66793.78724541 67644.87711878 74702.95282053

 68054.75502851 70902.35184092 64171.47270772 68081.38734615

 71042.4918974  67281.01437174]

Mean: 68473.75558372994

Standard deviation: 2844.0256903763307

6.2 均方误差

到此，预测已经结束，但是观察上面的Prediction和Labels的大小，还有很大的误差。

下面使用均方误差来评估和调整模型。

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)

lin_mse = mean_squared_error(housing_label, housing_predictions)

lin_rmse = np.sqrt(lin_mse)

print(lin_rmse)

68147.95744947501

6.3 决策树模型DTR

LR的均方误差很大，拟合不够。所以换一个模型测试一下。

# * LR的均方误差很大，拟合不够。 所以换一个模型测试一下。

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()

tree_reg.fit(housing_prepared, housing_label)

housing_predictions = tree_reg.predict(housing_prepared)

tree_mse = mean_squared_error(housing_label, housing_predictions)

tree_rmse = np.sqrt(tree_mse)

print(tree_rmse)

0.0

6.4 交叉验证

使用决策树模型发现误差为0，这代表肯定过拟合了。
下面使用交叉验证来更好的评估决策树模型。

from sklearn.model_selection import cross_val_score

# 十折交叉验证

score = cross_val_score(tree_reg, housing_prepared,

                        housing_label, scoring="neg_mean_squared_error", cv=10)

rmse_score = np.sqrt(-score)

print(rmse_score)

print("Mean:", rmse_score.mean())

print("Standard deviation:", rmse_score.std())

# 这里的输出指：交叉验证决策树获得的分数是rmse_score.mean(),上下浮动rmse_score.std()个数值。

[70190.39079163 66064.66678933 70919.2241942  69755.78097769

 71539.03243522 74039.67372515 70437.47726481 70239.92318825

 75076.56450274 69247.65253169]

Mean: 70751.0386400731

Standard deviation: 2367.7244306133935

6.5 随机森林

# 上面两个模型都不尽如人意，所以测试第三个模型：RF

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()

forest_reg.fit(housing_prepared, housing_label)

housing_predictions = forest_reg.predict(housing_prepared)

score = cross_val_score(forest_reg, housing_prepared,

                        housing_label, scoring="neg_mean_squared_error", cv=10)

forest_rmse_score = np.sqrt(-score)

print(forest_rmse_score)

print("Mean:", forest_rmse_score.mean())

print("Standard deviation:", forest_rmse_score.std())

print("Predictions:\t", forest_reg.predict(some_data_prepared))  # 预测样本

print("Labels:\t\t", list(some_labels))         # 实际值

D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

[52198.48245471 49190.66608383 52933.46838462 55013.84784886

 51547.27239792 56007.14126735 51188.89643343 51095.69872494

 55675.80059914 51441.22938544]

Mean: 52629.25035802469

Standard deviation: 2132.8000877264612

Predictions:	 [261400. 327800. 247480.  46540. 238250.]

Labels:		 [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

6.6 模型保存

训练好的模型要进行保存，相关结果也要进行保存。
使用python的pickel模块或sklearn.externals.joblib模块保存模型。

from sklearn.externals import joblib

joblib.dump(forest_reg, "my_model.pkl")         # 保存

my_model_loaded = joblib.load("my_model.pkl")  # 导入V

7.模型优化

7.1 网格搜索

除了手动操作调整超参数，还可以通过算法自动调整超参数，第一个就是网格搜索方法（GridSearchCV)

网格搜索是一种穷举的搜索方案，即将所有参数的所有可能的组合组成一个表格，对各个参数逐一计算。

网格搜索的结果好坏与数据集的划分有很大的关系，所以网格搜索常常与交叉验证一起使用。

Grid Search调参方法存在的弊端是参数越多，候选值越多，耗费时间越长。所以，一般情况下，先定一个大范围，然后再细化。

from sklearn.model_selection import GridSearchCV

para_grid = [

    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},   # 3*4网格搜索，12种

    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]} # 2*3 6种 共18种组合

]

forest_reg2 = RandomForestRegressor()

# 配合五折交叉搜索，18*5 = 90 次，总共进行了90次的模型训练

grid_search = GridSearchCV(forest_reg2,para_grid,cv=5,scoring="neg_mean_squared_error")

grid_search.fit(housing_prepared,housing_label)

print(grid_search.best_params_)  # 打印最佳参数

print(grid_search.best_estimator_) # 最佳模型

cvres = grid_search.cv_results_     # 评估分数

for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):

    print(np.sqrt(-mean_score),params)

{'max_features': 8, 'n_estimators': 30}

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,

           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,

           min_impurity_split=None, min_samples_leaf=1,

           min_samples_split=2, min_weight_fraction_leaf=0.0,

           n_estimators=30, n_jobs=None, oob_score=False,

           random_state=None, verbose=0, warm_start=False)

64284.877079430145 {'max_features': 2, 'n_estimators': 3}

56787.08219950486 {'max_features': 2, 'n_estimators': 10}

54634.8597925206 {'max_features': 2, 'n_estimators': 30}

62625.13023634532 {'max_features': 4, 'n_estimators': 3}

54178.71450231148 {'max_features': 4, 'n_estimators': 10}

52166.9297650929 {'max_features': 4, 'n_estimators': 30}

60632.06160053902 {'max_features': 6, 'n_estimators': 3}

53671.382898481956 {'max_features': 6, 'n_estimators': 10}

51209.764682060915 {'max_features': 6, 'n_estimators': 30}

60205.08228898087 {'max_features': 8, 'n_estimators': 3}

53046.02597867227 {'max_features': 8, 'n_estimators': 10}

50988.49917680898 {'max_features': 8, 'n_estimators': 30}

64237.4676969949 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}

55863.58989745313 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}

61235.68857900436 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}

54690.58049493668 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

59267.34518636521 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}

53190.09578420519 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

7.2 随机搜索

当需要搜索的组合很多时，就不适合使用表格搜索了。此时通常使用随机搜索（RandomizedSearchCV)

随机搜索不会穷尽所有参数，它会在参数空间随机采样。
随机搜索策略：

对于搜索范围是distribution的超参数，根据给定的distribution随机采样；
对于搜索范围是list的超参数，在给定的list中等概率采样；
对a、b两步中得到的n_iter组采样结果，进行遍历。
（补充）如果给定的搜索范围均为list，则不放回抽样n_iter次。

随机搜索的优点：

如果运行随机搜索1000个迭代，那么将会探索每个超参数的1000个不同的值（而不是像网格搜索方法那样每个超参数仅探索少量几个值）。

通过简单地设置迭代次数，可以更好地控制要分配给探索的超参数的计算预算。

7.3 集成方法

还有一种调整方法就是将各个好的模型集成起来，组合成一个集成模型。（类似随机森林）

7.4 模型分析

from sklearn.preprocessing import LabelEncoder

feature_importances=grid_search.best_estimator_.feature_importances_

print(feature_importances)

encoder=LabelEncoder()

housing_cat=housing["ocean_proximity"]

housing_cat_encoded=encoder.fit_transform(housing_cat)

extra_attribs=["rooms_per_household","pop_per_household","bedrooms_per_room"]

cat_one_hot_attribs=list(encoder.classes_)

attribus=num_attribs+extra_attribs+cat_one_hot_attribs

sorted(zip(feature_importances,attribus),reverse=True)

[6.49886314e-02 5.75244728e-02 3.99323296e-02 1.34508217e-02

 1.30533240e-02 1.29077646e-02 1.32840680e-02 3.34406458e-01

 3.95819895e-02 5.44151119e-02 6.77111178e-02 2.95775829e-02

 5.97611143e-02 2.87285632e-02 8.84117371e-03 1.54870133e-01

 1.39273664e-04 3.15616780e-03 3.66990154e-03]

[(0.3344064582565652, 'median_income'),

 (0.1548701334803861, 'INLAND'),

 (0.06771111779862625, 'population_per_household'),

 (0.06498863136588338, 'longitude'),

 (0.059761114331979774, 'pop_per_household'),

 (0.05752447280722709, 'latitude'),

 (0.05441511192586514, 'bedrooms_per_house'),

 (0.039932329606253154, 'housing_median_age'),

 (0.03958198945470592, 'room_per_household'),

 (0.02957758287533781, 'rooms_per_household'),

 (0.028728563174464258, 'bedrooms_per_room'),

 (0.013450821682219134, 'total_rooms'),

 (0.013284067951082463, 'households'),

 (0.013053323968912556, 'total_bedrooms'),

 (0.012907764611245971, 'population'),

 (0.008841173709972056, '<1H OCEAN'),

 (0.003669901537931307, 'NEAR OCEAN'),

 (0.0031561677969219157, 'NEAR BAY'),

 (0.00013927366442045662, 'ISLAND')]

8. 测试模型

final_model = grid_search.best_estimator_

x_test = strat_test_set.drop("median_house_value",axis=1)

y_test = strat_test_set["median_house_value"].copy()

x_test_prepared = full_pipeline.transform(x_test)

final_predications = final_model.predict(x_test_prepared)

final_mse = mean_squared_error(y_test,final_predications)

final_rmse = np.sqrt(final_mse)

print(final_rmse)

48557.63204741025

8.1 计算95%的置信区间

from scipy import stats

confidence = 0.95

squared_errors = (final_predications-y_test)**2  # 平方误差

mean = squared_errors.mean()

m = len(squared_errors)

confidence_interval = np.sqrt(stats.t.interval(confidence, m-1, loc=np.mean(squared_errors),

                                               scale=stats.sem(squared_errors)))

print(confidence_interval)

9. 课后练习

9.1 课后练习1：SVR预测器

使用网格搜索寻找SVR预测器中最好的参数

from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVR

para_grid = [

    {'kernel':['linear'],'C':[10.,30.,100.,300.,1000.,3000.,10000.,30000.0]},

    {'kernel':['rbf'],'C':[1.0,3.0,10.,30.,100.,300.,1000.0],

    'gamma':[0.01,0.03,0.1,0.3,1.0,3.0]},

]                                                       # 参数列表

svm_reg = SVR()

grid_search = GridSearchCV(svm_reg,para_grid,cv=5,scoring = 'neg_mean_squared_error',verbose = 2,n_jobs=4)

grid_search.fit(housing_prepared,housing_label)

negative_mse = grid_search.best_score_

rmse = np.sqrt(-negative_mse)

print(rmse)

print(grid_search.best_params_)

9.2 课后练习2：替换随机搜索调参方法

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import expon, reciprocal

param_distribs = {

        'kernel': ['linear', 'rbf'],

        'C': reciprocal(20, 200000),

        'gamma': expon(scale=1.0),

    }

svm_reg = SVR()

rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,

                                n_iter=50, cv=5, scoring='neg_mean_squared_error',

                                verbose=2, n_jobs=4, random_state=42)

rnd_search.fit(housing_prepared, housing_label)

negative_mse = rnd_search.best_score_

rmse = np.sqrr(-negative_mse)

print(rmse)

Hand on Machine Learning 第二章：端到端的机器学习的更多相关文章

Machine Learning|Andrew Ng|Coursera 吴恩达机器学习笔记
Week1: Machine Learning: A computer program is said to learn from experience E with respect to some ...
Machine Learning|Andrew Ng|Coursera 吴恩达机器学习笔记(完结)
Week 1: Machine Learning: A computer program is said to learn from experience E with respect to some ...
Machine Learning第十周笔记：大规模机器学习
博客已经迁移到Marcovaldo's blog (http://marcovaldong.github.io/) 刚刚完毕了Andrew Ng在Cousera上的Machine Learning的第 ...
Coursera machine learning 第二周编程作业 Linear Regression
必做: [*] warmUpExercise.m - Simple example function in Octave/MATLAB[*] plotData.m - Function to disp ...
Coursera machine learning 第二周 quiz 答案 Linear Regression with Multiple Variables
https://www.coursera.org/learn/machine-learning/exam/7pytE/linear-regression-with-multiple-variables ...
Coursera machine learning 第二周 quiz 答案 Octave/Matlab Tutorial
https://www.coursera.org/learn/machine-learning/exam/dbM1J/octave-matlab-tutorial Octave Tutorial 5 ...
machine learning for hacker记录(1) R与机器学习
开篇:首先这本书的名字很霸气,全书内容讲的是R语言在机器学习上面的应用,一些基本的分类算法(tree,SVM,NB),回归算法,智能优化算法,维度约减等,机器学习领域已经有很多成熟的R工具箱,毕竟这个 ...
Hands on Machine Learning with Sklearn and TensorFlow学习笔记——机器学习概览
一.什么是机器学习? 计算机程序利用经验E(训练数据)学习任务T(要做什么,即目标),性能是P(性能指标),如果针对任务T的性能P随着经验E不断增长,成为机器学习.[这是汤姆米切尔在1997年定义] ...
机器学习(Machine Learning)&深度学习(Deep Learning)资料(Chapter 2)
##机器学习(Machine Learning)&深度学习(Deep Learning)资料(Chapter 2)---#####注:机器学习资料[篇目一](https://github.co ...

随机推荐

已知链表头结点指针head,写一个函数把这个链表逆序
Node* ReverseList ( Node *head ) { if ( head == NULL || head->next == NULL ) return head; Node *p ...
CRMEasy知识库访问权限
知识库设置BCCBINFO文件夹为共享文件夹,给予普通域用户和管理员域用户完全控制权限,但是当以管理员域用户身份对文件进行操作后,以普通域用户登录不会有效果,甚至出现“访问权限”的错误,这是由于文件夹 ...
IP地址转二进制
例如216这个数 216 = 1 * 128 +1 * 64 + 0 * 32 + 1 * 16 + 1 * 8 + 0 * 4 + 0 * 2 + 0 * 1 提取乘号之前的数字我们得到所代表的二进 ...
unkown类型
1,任何类型的值都可以赋给 unkown类型 2. 如果没有类型断言或基于控制流的类型细化时 unknown 不可以赋值给其它类型,此时它只能赋值给 unknown 和 any 类型 3. 如果没有类 ...
常见BUG
1.没有配置Tomcat服务,由于 <exclusions> <exclusion> <groupId>org.springframework.boot</g ...
Django【第10篇】：Django之分页初级版本
分页和中间件一.分页 Django的分页器(paginator) view.py from django.shortcuts import render,HttpResponse # Create ...
Django【第9篇】：Django之用户认证auth模块
用户认证--------------auth模块一.auth模块 from django.contrib import auth 1 .authenticate() :验证用户输入的用户名和密码 ...
Mybatis foreach批量插入
1.foreach的属性 item:集合中元素迭代时的别名,必填 index:在list和array中,index是元素的序号:在map中,index是元素的key,可选 open:foreach代码 ...
Golang在京东列表页实践总结
Golang在京东列表页实践总结作者:张洪涛 10余年软件开发和设计经验,曾就职于搜狐.搜狗.前matrixjoy公司联合创始人.甘普科技CTO. 目前线上状态基于搜索实现: 全量数据,搜索结果不 ...
@media兼容iphone4、5、6
在网页中,pixel与point比值称为device-pixel-ratio,普通设备都是1,iPhone 4是2,有些Android机型是1.5. 那么-webkit-min-device-pixe ...

Hand on Machine Learning 第二章：端到端的机器学习