# -*- coding: utf-8 -*-
"""
Created on Sat Aug 18 16:23:17 2018

@author: acadsoc
"""
import scipy
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_predict, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import  Lasso, LassoCV, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from statsmodels.formula import api as smf
import sys
import os

plt.style.use('ggplot') # 设置ggplot2画图风格
# 根据不同平台设置其中文字体路径
if sys.platform == 'linux':
    zh_font = matplotlib.font_manager.FontProperties(
        fname='/path/anaconda3/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf/STZHONGS.TTF')
else:
    zh_font = matplotlib.font_manager.FontProperties(fname='C:\Windows\Fonts\STZHONGS.ttf')  # 设置中文字体

# 根据不同平台设定工作目录
if sys.platform == 'linux':
    os.chdir('path/jupyternb/ml/acadsoc/rollingRegression') # Linux path
else:
    os.chdir('D:/Python/rollingRegression') # Windows path

class featureSelection():
    '''
    多元线性回归特征选择类。
    
    参数
    ----
    random_state : int,默认是None
        随机种子。
        
    属性
    ----
    elasticnet_rs_best : model
        弹性网络随机搜索最佳模型。
    elasticnet_rs_feat_selected_ : dataframe
        弹性网络随机搜索最佳模型选择的系数大于0的变量。
    elasticnet_rs_R2_ : float
        弹性网络随机搜索最佳模型Rsquared。
    eln : model
        弹性网络。
    elasticnet_coef_ : dataframe
        弹性网络系数。
    elasticnet_feat_selected_ : list
        弹性网络选择系数大于0的变量。
    elasticnet_feat_ : float
        弹性网络Rsquared。
    rf_rs_best : model
        随机森林随机搜索最佳模型。
    rf_rs_feat_impo_ : dataframe
        随机森林随机搜索变量重要性排序。
    rf_rs_feat_selected_ : list
        随机森林随机搜索累积重要性大于impo_cum_threshold的变量列表。
    rf_rs_R2_ : float
        随机森林随机搜索Rsquared。
    rf_feat_impo_ : dataframe
        随机森林变量重要性排序。
    rf_feat_selected_ : list
        随机森林累积重要性大于impo_cum_threshold的变量列表。
    rf_R2_ : float
        随机森林Rsquared。
    stepwise_model : model
        逐步回归模型。
    '''      
    def __init__(self, random_state=None):
        self.random_state = random_state # 随机种子

def elasticNetRandomSearch(self, df, cv=10, n_iter=1000, n_jobs=-1, intercept=True,
                               normalize=True):
        '''
        ElasticNet随机搜索,搜索最佳模型。
        
        参数
        ----
        df : dataframe
            分析用数据框,response为第一列。
        cv : int, 默认是10
            交叉验证次数。
        n_iter : int, 默认是1000
            最大迭代次数。
        n_jobs : int, 默认是-1
            使用cpu线程数,默认为-1,表示所有线程。
        intercept : bool, 默认是True
            是否有截距项。
        normalize : bool, 默认是True
            是否将数据标准化。  
        '''
        if normalize: # 如果需要标准化数据
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)
            
        X = df.iloc[:, 1:]
        y = df.iloc[:, 0]
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        eln = ElasticNet(fit_intercept=intercept)
        param_rs = {'alpha' : scipy.stats.expon(loc=0, scale=1),  # 模型需搜索的参数
                    'l1_ratio' : scipy.stats.uniform(loc=0, scale=1)}
        
        elasticnet_rs = RandomizedSearchCV(eln,  # 建立随机搜索
                                param_distributions=param_rs,
                                scoring='r2',
                                cv=cv,
                                n_iter=n_iter,
                                n_jobs=n_jobs)
        elasticnet_rs.fit(X, y)  # 模型训练        
        # 用最佳模型进行变量筛选变量、系数  
        self.elasticnet_rs_best = ElasticNet(alpha=elasticnet_rs.best_params_['alpha'],
                                          l1_ratio = elasticnet_rs.best_params_['l1_ratio'])
        self.elasticnet_rs_best.fit(X, y)
        coef = pd.DataFrame(self.elasticnet_rs_best.coef_, index=df.columns[1:],
                            columns=['系数']).sort_values(by='系数', axis=0, ascending=False)
        self.elasticnet_rs_feat_selected_ = coef[coef > 0].dropna(axis=1).columns
        self.elasticnet_rs_R2_ = 1 - np.mean((y.values.reshape(-1,1) -
                                              self.elasticnet_rs_best.predict(X).reshape(-1,1)) ** 2) / np.var(y)
        return self    
    
    def elasticNetFeatureSelectPlot(self, df, l1_ratio=.7, normalize=True, intercept=True,
                                    plot_width=12, plot_height=5, xlim_exp=[-5, 1], ylim=[-1, 1]):
        '''
        绘制ElasticNet正则化效果图。
        
        参数
        ----
        df : dataframe
            分析用数据框,response为第一列。
        l1_ratio : float, 默认是0.7
            l1正则化率。
        normalize : bool, 默认是True
            是否将数据标准化。  
        intercept : bool, 默认是True
            回归方程是否有常数项。
        plot_width : int, 默认是12
            画板宽度。
        plot_height : int, 默认是5
            画板高度。
        xlim_exp : list, 默认是[-5, 1]
            x轴显示指数取值范围。
        ylim : list, 默认是[-1, 1]
            y轴显示取值范围。
        '''
        if normalize: # 如果需要标准化数据
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)  
        
        X = df.iloc[:, 1:]
        y = df.iloc[:, 0]
        
        plt.figure(figsize=(plot_width, plot_height))
        ax = plt.subplot(111)
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'pink', 'lightgreen',
                  'lightblue', 'gray', 'indigo', 'orange', 'seagreen', 'gold', 'purple']
        weights, params = [], []
        for alpha in np.arange(-5, 1, 0.1, dtype=float):
            eln = ElasticNet(alpha=10 ** alpha, l1_ratio=l1_ratio, random_state=123,
                             fit_intercept=intercept)
            eln.fit(X, y)
            weights.append(eln.coef_)
            params.append(10 ** alpha)
        
        weights = np.array(weights)
        for column, color in zip(range(weights.shape[1]), colors):
            plt.plot(params, weights[:, column], label=df.columns[column + 1], color=color)
        
        plt.axhline(0, color='black', linestyle='--', linewidth=3)
        plt.xlim(10 ** xlim_exp[0], 10 ** xlim_exp[1])
        plt.ylim(ylim)
        plt.title('弹性网络变量选择图', fontproperties=zh_font)
        plt.ylabel('权重系数', fontproperties=zh_font)
        plt.xlabel('$alpha$')
        plt.xscale('log')
        plt.xticks(10 ** np.arange(xlim_exp[0], xlim_exp[1], dtype=float),
                   10 ** np.arange(xlim_exp[0], xlim_exp[1], dtype=float))
        plt.legend(loc='best', prop=zh_font)
        ax.legend(prop=zh_font)
        #plt.grid()
        plt.show()
        return self
    
    ''''''
    def elasticNet(self, df, feat_selected=None, alpha=1, l1_ratio=.7, intercept=True, normalize=False):
        '''
        ElasticNet回归分析。
        
        参数
        ----
        df : dataframe
            分析用数据框,response为第一列。
        alpha : float, 默认是1
            alpha。
        l1_ratio : float, 默认是0.7
            l1正则化率。
        intercept : bool, 默认是True
            是否有截距项。
        normalize : bool, 默认是True
            是否将数据标准化。          
        '''
        if normalize: # 如果需要标准化数据
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)  
        
        if feat_selected is not None:  # 如果输入了选择好的变量
            X = df[feat_selected]
        else:            
            X = df.iloc[:, 1:]
        y = df.iloc[:, 0]
        
        self.eln = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=intercept)
        self.eln.fit(X, y)  # 模型训练
        
        # 变量、系数,R2
        self.elasticnet_coef_ = pd.DataFrame(self.eln.coef_, index = X.columns,
                            columns=['系数']).sort_values(by='系数', ascending=False)
        self.elasticnet_feat_selected_ = self.elasticnet_coef_[self.elasticnet_coef_ > 0].dropna(axis=0).index
        self.elasticnet_R2_ = 1 - np.mean((y.values.reshape(-1,1) -
                                           self.eln.predict(X).reshape(-1,1)) ** 2) / np.var(y)
        return self        
    
    def featureBarhPlot(self, df_coef, figsize=(12, 6)):   
        '''
        画特征条形图(纵向排列)。
        
        参数
        ----
        df_coef : dataframe
            特征系数(重要性)数据框。
        fitsize : tuple, 默认是(12, 6)
            画布宽高。
        '''       
        coef = df_coef.sort_values(by=df_coef.columns[0], axis=0, ascending=True)
        plt.figure(figsize=figsize)
        y_label = np.arange(len(coef))
        plt.barh(y_label, coef.iloc[:, 0])
        plt.yticks(y_label, coef.index, fontproperties=zh_font)
        
        for i in np.arange(len(coef)):
            if coef.iloc[i, 0] >= 0:
                dist = 0.003 * coef.iloc[:, 0].max()
            else:
                dist = -0.02 * coef.iloc[:, 0].max()
            plt.text(coef.iloc[i, 0] + dist, i - 0.2, '%.3f' % coef.iloc[i, 0], fontproperties=zh_font)
            
        # plt.grid()
        plt.ylabel('特征', fontproperties=zh_font)
        plt.xlabel('特征系数', fontproperties=zh_font)
        plt.title('特征系数条形图', fontproperties=zh_font)
        plt.legend(prop=zh_font)
        plt.show()         
    
    def randomForestRandomSearch(self, df, cv=10, n_iter=100, n_jobs=-1, impo_cum_threshold=.85,
                                 normalize=True):
        '''
        RandomForest随机搜索,搜索最佳模型。
        
        参数
        ----
        df : dataframe
            分析用数据框,response为第一列。
        cv : int, 默认是10
            交叉验证次数。
        n_iter : int, 默认是100
            最大迭代次数。
        n_jobs : int, 默认是-1
            使用cpu线程数,默认为-1,表示所有线程。
        impo_cum_threshold : float, 默认是0.85
            按累积重要性选择变量阈值。
        normalize : bool, 默认是True
            是否将数据标准化。
        '''
        if normalize: # 如果需要标准化数据
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)  
            
        X = df.iloc[:, 1:]
        y = df.iloc[:, 0]
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        rf = RandomForestRegressor()
        param_rs = {'n_estimators' : np.arange(1, 500),  # 模型需搜索的参数
                    'max_features' : np.arange(1, X.shape[1] + 1)}
        
        rf_rs = RandomizedSearchCV(rf,  # 建立随机搜索
                                param_distributions=param_rs,
                                scoring='r2',
                                cv=cv,
                                n_iter=n_iter,
                                n_jobs=n_jobs)
        rf_rs.fit(X, y)  # 模型训练        
        # 用最佳模型进行变量筛选变量、系数  
        self.rf_rs_best = RandomForestRegressor(n_estimators=rf_rs.best_params_['n_estimators'],
                                          max_features=rf_rs.best_params_['max_features'])
        self.rf_rs_best.fit(X, y)
        self.rf_rs_feat_impo_ = pd.DataFrame(self.rf_rs_best.feature_importances_, index = df.columns[1:],
                            columns=['系数']).sort_values(by='系数', axis=0, ascending=False)
        
        n = 0
        for i, v in enumerate(self.rf_rs_feat_impo_.values.cumsum()):
            if v >= impo_cum_threshold:
                n = i
                break
                
        self.rf_rs_feat_selected_ = self.rf_rs_feat_impo_.index[:n+1]           
        self.rf_rs_R2_ = 1 - np.mean((y.values.reshape(-1,1) - \
                                              self.rf_rs_best.predict(X).reshape(-1,1)) ** 2) / np.var(y)
        return self
    
    def randomForest(self, df, feat_selected=None, impo_cum_threshold=.85,
                     n_estimators=100, max_features='auto', normalize=False):
        '''
        Randomforest回归分析。
        
        参数
        ----
        df : dataframe
            分析用数据框,response为第一列。
        feat_selected : list, 默认是None
            选择的特征。
        impo_cum_threshold : float, 默认是0.85
            按累积重要性选择变量阈值。
        n_estimators : int, 默认是100
            森林含树数。
        max_features : int, 默认是'auto'
            每课时最大选择特征数。        
        normalize : bool, 默认是True
            是否将数据标准化。
        '''    
        if normalize: # 如果需要标准化数据
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)  
        
        if feat_selected is not None:  # 如果输入了选择好的变量
            X = df[feat_selected]
        else:            
            X = df.iloc[:, 1:]
        y = df.iloc[:, 0]
        
        self.rf = RandomForestRegressor(n_estimators=n_estimators, max_features=max_features)
        self.rf.fit(X, y)  # 模型训练
        
        # 变量、系数,R2
        self.rf_feat_impo_ = pd.DataFrame(self.rf.feature_importances_, index = X.columns,
                            columns=['系数']).sort_values(by='系数', ascending=False)
        
        n = 0
        for i, v in enumerate(self.rf_feat_impo_.values.cumsum()):
            if v >= impo_cum_threshold:
                n = i
                break
                
        self.rf_feat_selected_ = self.rf_feat_impo_.index[:n+1]      
        self.rf_R2_ = 1 - np.mean((y.values.reshape(-1,1) - self.rf.predict(X).reshape(-1,1)) ** 2) / np.var(y)
        return self    
    
    def stepwise(self, df, response, intercept=True, normalize=False, criterion='bic',
                 f_pvalue_enter=.05, p_value_enter=.05, direction='backward', show_step=True,
                 criterion_enter=None, criterion_remove=None,max_iter=200, **kw):
                 
        '''
        逐步回归。
        
        参数
        ----
        df : dataframe
            分析用数据框,response为第一列。
        response : str
            回归分析相应变量。
        intercept : bool, 默认是True
            模型是否有截距项。
        criterion : str, 默认是'bic'
            逐步回归优化规则。
        f_pvalue_enter : float, 默认是.05
            当选择criterion=’ssr‘时,模型加入或移除变量的f_pvalue阈值。
        p_value_enter : float, 默认是.05
            当选择derection=’both‘时,移除变量的pvalue阈值。
        direction : str, 默认是'backward'
            逐步回归方向。
        show_step : bool, 默认是True
            是否显示逐步回归过程。
        criterion_enter : float, 默认是None
            当选择derection=’both‘或'forward'时,模型加入变量的相应的criterion阈值。
        criterion_remove : float, 默认是None
            当选择derection='backward'时,模型移除变量的相应的criterion阈值。
        max_iter : int, 默认是200
            模型最大迭代次数。
        '''
        criterion_list = ['bic', 'aic', 'ssr', 'rsquared', 'rsquared_adj']
        if criterion not in criterion_list:
            raise IOError('请输入正确的criterion, 必须是以下内容之一:', '\n', criterion_list)
            
        direction_list = ['backward', 'forward', 'both']
        if direction not in direction_list:
            raise IOError('请输入正确的direction, 必须是以下内容之一:', '\n', direction_list)
            
        # 默认p_enter参数    
        p_enter = {'bic':0.0, 'aic':0.0, 'ssr':0.05, 'rsquared':0.05, 'rsquared_adj':-0.05}
        if criterion_enter:  # 如果函数中对p_remove相应key传参,则变更该参数
            p_enter[criterion] = criterion_enter
            
        # 默认p_remove参数    
        p_remove = {'bic':0.01, 'aic':0.01, 'ssr':0.1, 'rsquared':0.05, 'rsquared_adj':-0.05}
        if criterion_remove:  # 如果函数中对p_remove相应key传参,则变更该参数
            p_remove[criterion] = criterion_remove
            
        if normalize: # 如果需要标准化数据
            intercept = False  # 截距强制设置为0
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)  
                
        ''' forward '''
        if direction == 'forward':
            remaining = list(df.columns)  # 自变量集合
            remaining.remove(response)
            selected = []  # 初始化选入模型的变量列表
            # 初始化当前评分,最优新评分
            if intercept: # 是否有截距
                formula = "{} ~ {} + 1".format(response, remaining[0])
            else:
                formula = "{} ~ {} - 1".format(response, remaining[0])
                    
            result = smf.ols(formula, df).fit() # 最小二乘法回归模型拟合            
            current_score = eval('result.' + criterion)
            best_new_score = eval('result.' + criterion)
                
            if show_step:    
                print('\nstepwise starting:\n')
            iter_times = 0
            # 当变量未剔除完,并且当前评分更新时进行循环
            while remaining and (current_score == best_new_score) and (iter_times<max_iter):
                scores_with_candidates = []  # 初始化变量以及其评分列表
                for candidate in remaining:  # 在未剔除的变量中每次选择一个变量进入模型,如此循环
                    if intercept: # 是否有截距
                        formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                    else:
                        formula = "{} ~ {} - 1".format(response, ' + '.join(selected + [candidate]))
                        
                    result = smf.ols(formula, df).fit() # 最小二乘法回归模型拟合
                    fvalue = result.fvalue
                    f_pvalue = result.f_pvalue                    
                    score = eval('result.' + criterion)                    
                    scores_with_candidates.append((score, candidate, fvalue, f_pvalue)) # 记录此次循环的变量、评分列表
                    
                if criterion == 'ssr':  # 这几个指标取最小值进行优化
                    scores_with_candidates.sort(reverse=True)  # 对评分列表进行降序排序
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()  # 提取最小分数及其对应变量
                    if ((current_score - best_new_score) > p_enter[criterion]) and (best_new_f_pvalue < f_pvalue_enter):  # 如果当前评分大于最新评分
                        remaining.remove(best_candidate)  # 从剩余未评分变量中剔除最新最优分对应的变量
                        selected.append(best_candidate)  # 将最新最优分对应的变量放入已选变量列表
                        current_score = best_new_score  # 更新当前评分
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, SSR = %.3f, Fstat = %.3f, FpValue = %.3e' %
                                  (best_candidate, best_new_score, best_new_fvalue, best_new_f_pvalue))
                    elif (current_score - best_new_score) >= 0 and (best_new_f_pvalue < f_pvalue_enter) and iter_times == 0: # 当评分差大于等于0,且为第一次迭代
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_f_pvalue < f_pvalue_enter) and iter_times == 0:  # 当评分差小于p_enter,且为第一次迭代
                        selected.append(remaining[0])
                        remaining.remove(remaining[0])
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (remaining[0], criterion, best_new_score))
                elif criterion in ['bic', 'aic']:  # 这几个指标取最小值进行优化
                    scores_with_candidates.sort(reverse=True)  # 对评分列表进行降序排序
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()  # 提取最小分数及其对应变量
                    if (current_score - best_new_score) > p_enter[criterion]:  # 如果当前评分大于最新评分
                        remaining.remove(best_candidate)  # 从剩余未评分变量中剔除最新最优分对应的变量
                        selected.append(best_candidate)  # 将最新最优分对应的变量放入已选变量列表
                        current_score = best_new_score  # 更新当前评分
                        #print(iter_times)
                        if show_step:  # 是否显示逐步回归过程  
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (current_score - best_new_score) >= 0 and iter_times == 0: # 当评分差大于等于0,且为第一次迭代
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # 当评分差小于p_enter,且为第一次迭代
                        selected.append(remaining[0])
                        remaining.remove(remaining[0])
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (remaining[0], criterion, best_new_score))
                else:
                    scores_with_candidates.sort()
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if (best_new_score - current_score) > p_enter[criterion]:
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        print(iter_times, flush=True)
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_score - current_score) >= 0 and iter_times == 0: # 当评分差大于等于0,且为第一次迭代
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # 当评分差小于p_enter,且为第一次迭代
                        selected.append(remaining[0])
                        remaining.remove(remaining[0])
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (remaining[0], criterion, best_new_score))
                iter_times += 1

if intercept: # 是否有截距
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
                
            self.stepwise_model = smf.ols(formula, df).fit()  # 最优模型拟合
            
            if show_step:  # 是否显示逐步回归过程                
                print('\nLinear regression model:', '\n  ', self.stepwise_model.model.formula)
                print('\n', self.stepwise_model.summary())
        
        ''' backward '''
        if direction == 'backward':
            remaining, selected = set(df.columns), set(df.columns)  # 自变量集合
            remaining.remove(response)
            selected.remove(response)  # 初始化选入模型的变量列表
             # 初始化当前评分,最优新评分
            if intercept: # 是否有截距
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
                    
            result = smf.ols(formula, df).fit() # 最小二乘法回归模型拟合            
            current_score = eval('result.' + criterion)
            worst_new_score = eval('result.' + criterion)
                
            if show_step:    
                print('\nstepwise starting:\n')
            iter_times = 0
            # 当变量未剔除完,并且当前评分更新时进行循环
            while remaining and (current_score == worst_new_score) and (iter_times<max_iter):
                scores_with_eliminations = []  # 初始化变量以及其评分列表
                for elimination in remaining:  # 在未剔除的变量中每次选择一个变量进入模型,如此循环
                    if intercept: # 是否有截距
                        formula = "{} ~ {} + 1".format(response, ' + '.join(selected - set(elimination)))
                    else:
                        formula = "{} ~ {} - 1".format(response, ' + '.join(selected - set(elimination)))
                        
                    result = smf.ols(formula, df).fit() # 最小二乘法回归模型拟合
                    fvalue = result.fvalue
                    f_pvalue = result.f_pvalue                    
                    score = eval('result.' + criterion)                    
                    scores_with_eliminations.append((score, elimination, fvalue, f_pvalue)) # 记录此次循环的变量、评分列表
                    
                if criterion == 'ssr':  # 这几个指标取最小值进行优化
                    scores_with_eliminations.sort(reverse=False)  # 对评分列表进行降序排序
                    worst_new_score, worst_elimination, worst_new_fvalue, worst_new_f_pvalue = scores_with_eliminations.pop()  # 提取最小分数及其对应变量
                    if ((worst_new_score - current_score) < p_remove[criterion]) and (worst_new_f_pvalue < f_pvalue_enter):  # 如果当前评分大于最新评分
                        remaining.remove(worst_elimination)  # 从剩余未评分变量中剔除最新最优分对应的变量
                        selected.remove(worst_elimination)  # 从已选变量列表中剔除最新最优分对应的变量
                        current_score = worst_new_score  # 更新当前评分
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Removing %s, SSR = %.3f, Fstat = %.3f, FpValue = %.3e' %
                                  (worst_elimination, worst_new_score, worst_new_fvalue, worst_new_f_pvalue))
                elif criterion in ['bic', 'aic']:  # 这几个指标取最小值进行优化
                    scores_with_eliminations.sort(reverse=False)  # 对评分列表进行降序排序
                    worst_new_score, worst_elimination, worst_new_fvalue, worst_new_f_pvalue = scores_with_eliminations.pop()  # 提取最小分数及其对应变量
                    if (worst_new_score - current_score) < p_remove[criterion]:  # 如果评分变动不显著
                        remaining.remove(worst_elimination)  # 从剩余未评分变量中剔除最新最优分对应的变量
                        selected.remove(worst_elimination)  # 从已选变量列表中剔除最新最优分对应的变量
                        current_score = worst_new_score  # 更新当前评分
                        if show_step:  # 是否显示逐步回归过程  
                            print('Removing %s, %s = %.3f' % (worst_elimination, criterion, worst_new_score))                        
                else:
                    scores_with_eliminations.sort(reverse=True)
                    worst_new_score, worst_elimination, worst_new_fvalue, worst_new_f_pvalue = scores_with_eliminations.pop()
                    if (current_score - worst_new_score) < p_remove[criterion]:
                        remaining.remove(worst_elimination)
                        selected.remove(worst_elimination)
                        current_score = worst_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Removing %s, %s = %.3f' % (worst_elimination, criterion, worst_new_score))                    
                iter_times += 1
                
            if intercept: # 是否有截距
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
                
            self.stepwise_model = smf.ols(formula, df).fit()  # 最优模型拟合
            
            if show_step:  # 是否显示逐步回归过程                
                print('\nLinear regression model:', '\n  ', self.stepwise_model.model.formula)
                print('\n', self.stepwise_model.summary())
        
        ''' both '''
        if direction == 'both':
            remaining = list(df.columns)  # 自变量集合
            remaining.remove(response)
            selected = []  # 初始化选入模型的变量列表
            # 初始化当前评分,最优新评分
            if intercept: # 是否有截距
                formula = "{} ~ {} + 1".format(response, remaining[0])
            else:
                formula = "{} ~ {} - 1".format(response, remaining[0])
                    
            result = smf.ols(formula, df).fit() # 最小二乘法回归模型拟合            
            current_score = eval('result.' + criterion)
            best_new_score = eval('result.' + criterion)
                
            if show_step:    
                print('\nstepwise starting:\n')
            # 当变量未剔除完,并且当前评分更新时进行循环
            iter_times = 0
            while remaining and (current_score == best_new_score) and (iter_times<max_iter):
                scores_with_candidates = []  # 初始化变量以及其评分列表
                for candidate in remaining:  # 在未剔除的变量中每次选择一个变量进入模型,如此循环
                    if intercept: # 是否有截距
                        formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                    else:
                        formula = "{} ~ {} - 1".format(response, ' + '.join(selected + [candidate]))
                        
                    result = smf.ols(formula, df).fit() # 最小二乘法回归模型拟合
                    fvalue = result.fvalue
                    f_pvalue = result.f_pvalue                    
                    score = eval('result.' + criterion)                    
                    scores_with_candidates.append((score, candidate, fvalue, f_pvalue)) # 记录此次循环的变量、评分列表
                    
                if criterion == 'ssr':  # 这几个指标取最小值进行优化
                    scores_with_candidates.sort(reverse=True)  # 对评分列表进行降序排序
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()  # 提取最小分数及其对应变量
                    if ((current_score - best_new_score) > p_enter[criterion]) and (best_new_f_pvalue < f_pvalue_enter):  # 如果当前评分大于最新评分
                        remaining.remove(best_candidate)  # 从剩余未评分变量中剔除最新最优分对应的变量
                        selected.append(best_candidate)  # 将最新最优分对应的变量放入已选变量列表
                        current_score = best_new_score  # 更新当前评分
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, SSR = %.3f, Fstat = %.3f, FpValue = %.3e' %
                                  (best_candidate, best_new_score, best_new_fvalue, best_new_f_pvalue))
                    elif (current_score - best_new_score) >= 0 and (best_new_f_pvalue < f_pvalue_enter) and iter_times == 0: # 当评分差大于等于0,且为第一次迭代
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_f_pvalue < f_pvalue_enter) and iter_times == 0:  # 当评分差小于p_enter,且为第一次迭代
                        selected.append(remaining[0])
                        remaining.remove(remaining[0])
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (remaining[0], criterion, best_new_score))
                elif criterion in ['bic', 'aic']:  # 这几个指标取最小值进行优化
                    scores_with_candidates.sort(reverse=True)  # 对评分列表进行降序排序
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()  # 提取最小分数及其对应变量
                    if (current_score - best_new_score) > p_enter[criterion]:  # 如果当前评分大于最新评分
                        remaining.remove(best_candidate)  # 从剩余未评分变量中剔除最新最优分对应的变量
                        selected.append(best_candidate)  # 将最新最优分对应的变量放入已选变量列表
                        current_score = best_new_score  # 更新当前评分
                        if show_step:  # 是否显示逐步回归过程  
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (current_score - best_new_score) >= 0 and iter_times == 0: # 当评分差大于等于0,且为第一次迭代
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # 当评分差小于p_enter,且为第一次迭代
                        selected.append(remaining[0])
                        remaining.remove(remaining[0])
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (remaining[0], criterion, best_new_score))
                else:
                    scores_with_candidates.sort()
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if (best_new_score - current_score) > p_enter[criterion]:  # 当评分差大于p_enter
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_score - current_score) >= 0 and iter_times == 0: # 当评分差大于等于0,且为第一次迭代
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # 当评分差小于p_enter,且为第一次迭代
                        selected.append(remaining[0])
                        remaining.remove(remaining[0])
                        if show_step:  # 是否显示逐步回归过程                             
                            print('Adding %s, %s = %.3f' % (remaining[0], criterion, best_new_score))
                            
                if intercept: # 是否有截距
                    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
                else:
                    formula = "{} ~ {} - 1".format(response, ' + '.join(selected))

result = smf.ols(formula, df).fit()  # 最优模型拟合                    
                if iter_times >= 1: # 当第二次循环时判断变量的pvalue是否达标
                    if result.pvalues.max() > p_value_enter:
                        var_removed = result.pvalues[result.pvalues == result.pvalues.max()].index[0]
                        p_value_removed = result.pvalues[result.pvalues == result.pvalues.max()].values[0]
                        selected.remove(result.pvalues[result.pvalues == result.pvalues.max()].index[0])
                        if show_step:  # 是否显示逐步回归过程                
                            print('Removing %s, Pvalue = %.3f' % (var_removed, p_value_removed))
                iter_times += 1
                
            if intercept: # 是否有截距
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
                
            self.stepwise_model = smf.ols(formula, df).fit()  # 最优模型拟合           
            if show_step:  # 是否显示逐步回归过程                
                print('\nLinear regression model:', '\n  ', self.stepwise_model.model.formula)
                print('\n', self.stepwise_model.summary())                
        # 最终模型选择的变量
        if intercept:
            self.stepwise_feat_selected_ = list(self.stepwise_model.params.index[1:])
        else:
            self.stepwise_feat_selected_ = list(self.stepwise_model.params.index)
        return self

回归分析特征选择(包括Stepwise算法) python 实现的更多相关文章

  1. pageRank算法 python实现

    一.什么是pagerank PageRank的Page可是认为是网页,表示网页排名,也可以认为是Larry Page(google 产品经理),因为他是这个算法的发明者之一,还是google CEO( ...

  2. 常见排序算法-Python实现

    常见排序算法-Python实现 python 排序 算法 1.二分法     python    32行 right = length-  :  ]   ):  test_list = [,,,,,, ...

  3. kmp算法python实现

    kmp算法python实现 kmp算法 kmp算法用于字符串的模式匹配,也就是找到模式字符串在目标字符串的第一次出现的位置比如abababc那么bab在其位置1处,bc在其位置5处我们首先想到的最简单 ...

  4. KMP算法-Python版

                               KMP算法-Python版 传统法: 从左到右一个个匹配,如果这个过程中有某个字符不匹配,就跳回去,将模式串向右移动一位.这有什么难的? 我们可以 ...

  5. 压缩感知重构算法之IRLS算法python实现

    压缩感知重构算法之OMP算法python实现 压缩感知重构算法之CoSaMP算法python实现 压缩感知重构算法之SP算法python实现 压缩感知重构算法之IHT算法python实现 压缩感知重构 ...

  6. 压缩感知重构算法之OLS算法python实现

    压缩感知重构算法之OMP算法python实现 压缩感知重构算法之CoSaMP算法python实现 压缩感知重构算法之SP算法python实现 压缩感知重构算法之IHT算法python实现 压缩感知重构 ...

  7. 压缩感知重构算法之CoSaMP算法python实现

    压缩感知重构算法之OMP算法python实现 压缩感知重构算法之CoSaMP算法python实现 压缩感知重构算法之SP算法python实现 压缩感知重构算法之IHT算法python实现 压缩感知重构 ...

  8. 压缩感知重构算法之IHT算法python实现

    压缩感知重构算法之OMP算法python实现 压缩感知重构算法之CoSaMP算法python实现 压缩感知重构算法之SP算法python实现 压缩感知重构算法之IHT算法python实现 压缩感知重构 ...

  9. 压缩感知重构算法之SP算法python实现

    压缩感知重构算法之OMP算法python实现 压缩感知重构算法之CoSaMP算法python实现 压缩感知重构算法之SP算法python实现 压缩感知重构算法之IHT算法python实现 压缩感知重构 ...

随机推荐

  1. RabbitMQ官方教程一Hello World(GOLANG语言实现)

    介绍 RabbitMQ是消息中间件:它接受并转发消息. 您可以将其视为邮局系统:将要发送的邮件放在邮箱中时, 可以确保邮递员最终将邮件传递给收件人. 以此类推,RabbitMQ是一个邮箱,一个邮局和一 ...

  2. input回退禁止

    //处理键盘事件 禁止后退键(Backspace)密码或单行.多行文本框除外 function forbidBackSpace(e) { var ev = e || window.event; //获 ...

  3. WDL学习笔记

    WDL是什么? WDL是由Broad Institute开发的一种流程开发语言,全称workflow description language,官方链接如下:https://software.broa ...

  4. qt QML弹出新页面之后,如何屏蔽上一个页面的按钮区域事件

    Rectangle{ //Rectangle是要显示的新页面 //增加一个mouseArea:,必须好把MouseArea作为第一个子元素,如果放在最后且不设置z属性的话,会覆盖其//他控件 Mous ...

  5. LeetCode 589. N叉树的前序遍历(N-ary Tree Preorder Traversal)

    589. N叉树的前序遍历 589. N-ary Tree Preorder Traversal LeetCode589. N-ary Tree Preorder Traversal 题目描述 给定一 ...

  6. 初始NLTK

    NLTK官网:链接 Natural Language Toolkit NLTK corpora and lexical resources such as WordNet, along with a ...

  7. ASP.NET Core依赖注入多个服务实现类

    依赖注入在 ASP.NET Core 中起中很重要的作用,也是一种高大上的编程思想,它的总体原则就是:俺要啥,你就给俺送啥过来. 服务类型的实例转由容器自动管理,无需我们在代码中显式处理. 因此,有了 ...

  8. mysql全面整理(用于复习、查阅)--正在更新

    Mysql学习 1. 关键字与函数名称全部大写 2. 数据库名称.表名称.字段名称全部小写 3. SQL语句必须以分号结尾 一.数据库基本操作 1. 创建.查看数据库 CREATE {DATABASE ...

  9. webpack css文件编译、自动添加前缀、剥离

    1.css文件编译 webpack默认只能编译js文件,引入css需要loader支持 // css文件写入js中 npm i style-loader -D // css文件loader npm i ...

  10. jenkins pipeline中获取shell命令的标准输出或者状态

    //获取标准输出//第一种 result = sh returnStdout: true ,script: "<shell command>" result = res ...