# -*- coding: utf-8 -*-
"""Linear regression with SciPy (simple) and statsmodels (multiple).

Reads ``../data/score_iq.csv`` (columns: sid, score, iq, academy, game, tv),
fits ``score ~ iq`` with ``scipy.stats.linregress`` and ``score ~ iq + academy``
with statsmodels OLS, and prints the fitted parameters and predictions.
"""
from scipy import stats  # linear regression module
import pandas as pd

score_df = pd.read_csv("../data/score_iq.csv")
print(score_df.info())  # 150 x 6
# Recorded output:
#   RangeIndex: 150 entries, 0 to 149
#   Data columns (total 6 columns):
#   sid        150 non-null int64
#   score      150 non-null int64
#   iq         150 non-null int64
#   academy    150 non-null int64
#   game       150 non-null int64
#   tv         150 non-null int64
#   dtypes: int64(6)

print(score_df.head())
# Recorded output:
#        sid  score   iq  academy  game  tv
#   0  10001     90  140        2     1   0
#   1  10002     75  125        1     3   3
#   2  10003     77  120        1     0   4
#   3  10004     83  135        2     3   2
#   4  10005     65  105        0     4   4

# 1) Simple linear regression
#    one independent variable (x) -> one dependent variable (y)
x = score_df.iq  # same as score_df['iq']
y = score_df.score  # same as score_df['score']

# Fit the simple linear regression model.
model = stats.linregress(x, y)

# Model result
print('model=', model)
# Recorded output:
#   model= LinregressResult(
#     slope=0.6514309527270075,       -> slope
#     intercept=-2.8564471221974657,  -> intercept
#     rvalue=0.8822203446134699,      -> correlation coefficient (R^2 = rvalue**2);
#                                        the closer |r| is to 1 the better
#     pvalue=2.8476895206683644e-50,  -> model significance (not significant if > 0.05)
#     stderr=0.028577934409305443)    -> standard error of the slope

# Regression equation = first-degree function
#   Y = a*X + b   (a: slope, b: intercept)
# First row of the data: score=90, iq=140.
# The intercept is ADDED; subtracting it (as the code previously did) gives
# 94.06 instead of the recorded 88.34.
Y = model.slope * 140 + model.intercept
print("점수 예측치=", Y)  # recorded: 88.34388625958358

err = 90 - Y
print("모델 오차=", err)  # recorded: 1.6561137404164157

print('x 기울기=', model.slope)  # recorded: 0.6514309527270075
print('x 절편=', model.intercept)  # recorded: -2.8564471221974657
print('x 설명력=', model.rvalue)  # recorded: 0.8822203446134699
print('x 유의성=', model.pvalue)  # recorded: 2.8476895206683644e-50
print('x 표준오차=', model.stderr)  # recorded: 0.028577934409305443

# 2) Multiple linear regression
#    two or more independent variables (X)
import statsmodels.formula.api as sm

corr = score_df.corr()
print("상관 계수 행렬")
print(corr)
# Recorded output:
#                 sid     score        iq   academy      game        tv
#   sid      1.000000 -0.014399 -0.007048 -0.004398  0.018806  0.024565
#   score   -0.014399  1.000000  0.882220  0.896265 -0.298193 -0.819752
#   iq      -0.007048  0.882220  1.000000  0.671783 -0.031516 -0.585033
#   academy -0.004398  0.896265  0.671783  1.000000 -0.351315 -0.948551
#   game     0.018806 -0.298193 -0.031516 -0.351315  1.000000  0.239217
#   tv       0.024565 -0.819752 -0.585033 -0.948551  0.239217  1.000000

# Model variables: X (iq, academy) -> y (score)
model = sm.ols(formula="score ~ iq + academy", data=score_df).fit()
print("model", model)  # object info
# Recorded output:
#   model <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x...>

# Model parameters: slopes and intercept
print(model.params)
# Recorded output:
#   Intercept    25.229141  -> intercept
#   iq            0.376966  -> slope of X1
#   academy       2.992800  -> slope of X2
#   dtype: float64

# Multiple linear regression equation, applied to the first row (iq=140, academy=2).
print(score_df.head())

# Read the coefficients from the fitted model instead of copying them by hand,
# so the prediction stays in sync with the data.
params = model.params
Y = params['iq'] * 140 + params['academy'] * 2 + params['Intercept']
print("예측치=", Y)  # recorded with hand-copied coefficients: 83.989981

# Model summary
print(model.summary())
# Recorded output:
#                             OLS Regression Results
#   ==============================================================================
#   Dep. Variable:                  score   R-squared:                       0.946
#   Model:                            OLS   Adj. R-squared:                  0.946
#   Method:                 Least Squares   F-statistic:                     1295.
#   Date:                Sat, 16 Feb 2019   Prob (F-statistic):           4.50e-94
#   Time:                        11:23:48   Log-Likelihood:                -275.05
#   No. Observations:                 150   AIC:                             556.1
#   Df Residuals:                     147   BIC:                             565.1
#   Df Model:                           2
#   Covariance Type:            nonrobust
#   ==============================================================================
#                    coef    std err          t      P>|t|      [0.025      0.975]
#   ------------------------------------------------------------------------------
#   Intercept     25.2291      2.187     11.537      0.000      20.907      29.551
#   iq             0.3770      0.019     19.786      0.000       0.339       0.415
#   academy        2.9928      0.140     21.444      0.000       2.717       3.269
#   ==============================================================================
#   Omnibus:                       36.342   Durbin-Watson:                   1.913
#   Prob(Omnibus):                  0.000   Jarque-Bera (JB):               54.697
#   Skew:                           1.286   Prob(JB):                     1.33e-12
#   Kurtosis:                       4.461   Cond. No.                     2.18e+03
#   ==============================================================================
#   Warnings:
#   [1] Standard Errors assume that the covariance matrix of the errors is
#       correctly specified.
#   [2] The condition number is large, 2.18e+03. This might indicate that there
#       are strong multicollinearity or other numerical problems.
#
# How to read the summary:
#   1. Prob (F-statistic): 4.50e-94 -> model significance (should be below 0.05)
#   2. Adj. R-squared: 0.946        -> explanatory power (closer to 1 is better)
#   3. P>|t|                        -> significance of each X (should be below 0.05)
# -*- coding: utf-8 -*-
# Example: applying a matrix product (dot) to regression-model prediction.
import pandas as pd
import numpy as np

# 1. Load the data set
score_df = pd.read_csv("../data/score_iq.csv")
print(score_df.head())  # 6 columns
# Recorded output:
#        sid  score   iq  academy  game  tv
#   0  10001     90  140        2     1   0
#   1  10002     75  125        1     3   3
#   2  10003     77  120        1     0   4
#   3  10004     83  135        2     3   2
#   4  10005     65  105        0     4   4

# 2. Build the subset
score_arr = score_df[['score', 'iq', 'academy']]  # 3 columns
print(score_arr.shape)  # (150, 3)
print(score_arr.info())
# Recorded output:
#   <class 'pandas.core.frame.DataFrame'>
#   RangeIndex: 150 entries, 0 to 149
#   Data columns (total 3 columns):
#   score      150 non-null int64
#   iq         150 non-null int64
#   academy    150 non-null int64
#   dtypes: int64(3)
#   memory usage: 3.6 KB
#   None

# 3. Select X and y
# Positional slicing uses .iloc; DataFrame.ix has been removed from pandas.
score_X = score_arr.iloc[:, 1:]  # two columns (150 x 2), 2-D
score_y = score_arr.iloc[:, 0]  # one column (150,), 1-D
print(score_X.shape)  # (150, 2)
print(score_y.shape)  # (150,)

# 4. Slopes and intercept, copied from the fitted OLS model:
#   Intercept    25.229141 -> intercept
#   iq            0.376966 -> slope of X1
#   academy       2.992800 -> slope of X2
slop = np.array([[0.376966], [2.992800]])  # 2-D column vector of slopes
Intercept = 25.229141  # scalar

# Y = (a1*x1 + a2*x2) + b
# (a1*x1 + a2*x2) is a matrix product.

# 5. Apply the matrix product (dot)
print(score_X.shape)  # (150, 2)
print(slop.shape)  # (2, 1)

# (150, 2) @ (2, 1) = (150, 1)
matmul = np.dot(score_X.values, slop)
Y = matmul + Intercept
print(Y)
# Recorded output:
#   [[83.989981]
#    [75.342691]
#    ...
#    [73.457861]]

# 6. Evaluate the model (actual vs predicted)
#   Y       = predictions
#   score_y = actual values
print(Y.shape)  # (150, 1) 2-D
print(score_y.shape)  # (150,) 1-D

# Flatten 2-D -> 1-D without hard-coding the number of rows.
Y_fitted = Y.reshape(-1)  # (150,)

df = pd.DataFrame({"Y_fitted": Y_fitted, 'score': score_y})
print(df)  # (150, 2)

# Correlation analysis
print(df.head())
# Recorded output:
#       Y_fitted  score
#   0  83.989981     90
#   1  75.342691     75
#   2  73.457861     77
#   3  82.105151     83
#   4  64.810571     65

cor = df.Y_fitted.corr(df.score)
print('corr=', cor)  # recorded: 0.9727792069594755
# -*- coding: utf-8 -*-
# Datasets bundled with scikit-learn.
from sklearn import datasets
import numpy as np

# 1. Datasets used for linear regression in this tutorial
# 1) iris
iris = datasets.load_iris()
print(iris)

iris_x = iris.data  # x
iris_y = iris.target  # y
print(type(iris_x))  # <class 'numpy.ndarray'>
print(np.shape(iris_x))  # (150, 4)
print(np.shape(iris_y))  # (150,)

print(iris_x)
# Recorded output (first rows):
#   [[5.1 3.5 1.4 0.2]
#    [4.9 3.  1.4 0.2]
#    [4.7 3.2 1.3 0.2]
#    [4.6 3.1 1.5 0.2]]
print(iris_y)
# Recorded output (abridged):
#   [0 0 ... 0 0]

# Categories of y
print(list(iris.target_names))
# ['setosa'=0, 'versicolor'=1, 'virginica'=2]

# 2) diabetes dataset
diabetes = datasets.load_diabetes()
diabetes_x = diabetes.data  # x
diabetes_y = diabetes.target  # y
print(diabetes_x.shape)  # (442, 10)
print(diabetes_y.shape)  # (442,)
print(diabetes_y)

# 3) Boston dataset
# load_boston was removed from scikit-learn in release 1.2; guard the access
# so the remaining sections still run on current versions.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError) as exc:
    print("load_boston is unavailable in this scikit-learn version:", exc)
else:
    boston_x = boston.data
    boston_y = boston.target
    print(boston_x.shape)  # (506, 13)
    print(boston_y.shape)  # (506,)
    print(boston.feature_names)
    # ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']

# 2. Datasets suited to classification
# 4) wine dataset: multi-class classification (softmax function)
#    e.g. class_0: 0.98 + class_1: 0.01 + class_2: 0.01 = 1
wine = datasets.load_wine()
wine_x = wine.data  # (178, 13)
wine_y = wine.target  # (178,)
print(wine.target_names)  # ['class_0' 'class_1' 'class_2']
print(wine_x.shape)  # (178, 13)
print(wine_y)
# Recorded output: 178 labels, a run of 0s, then a run of 1s, then a run of 2s.

# 5) Binary classification (sigmoid function)
#    YES if probability > 0.5, NO if < 0.5
breast = datasets.load_breast_cancer()
print(breast.data.shape)  # (569, 30)
print(breast.target.shape)  # (569,)
print(breast.target_names)  # ['malignant' 'benign']
print(breast)
# -*- coding: utf-8 -*-
# scikit-learn regression models, for a continuous y variable.
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression  # model
from sklearn.model_selection import train_test_split  # train set vs test set
from sklearn.metrics import mean_squared_error  # MSE (mean squared error)

# 1. Load the dataset
iris = pd.read_csv("../data/iris.csv")
print(iris.info())
# Recorded output:
#   RangeIndex: 150 entries, 0 to 149
#   Data columns (total 5 columns):
#   Sepal.Length    150 non-null float64
#   Sepal.Width     150 non-null float64
#   Petal.Length    150 non-null float64
#   Petal.Width     150 non-null float64
#   Species         150 non-null object
#   dtypes: float64(4), object(1)

print(iris.head())
# Recorded output:
#      Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
#   0           5.1          3.5           1.4          0.2  setosa
#   1           4.9          3.0           1.4          0.2  setosa
#   2           4.7          3.2           1.3          0.2  setosa
#   3           4.6          3.1           1.5          0.2  setosa
#   4           5.0          3.6           1.4          0.2  setosa

# 2. Select variables (x, y)
cols = list(iris.columns)
print(cols)
# ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

x_cols = cols[1:4]  # 'Sepal.Width', 'Petal.Length', 'Petal.Width'
y_cols = cols[0]  # 'Sepal.Length'

# Subset
data_df = iris[cols[:4]]  # columns 1-4
print(data_df.shape)  # (150, 4)

# 3. train set (70%) / test set (30%)
# The split is random; random_state=123 makes it reproducible.
iris_train, iris_test = train_test_split(
    data_df, test_size=0.3, random_state=123)
print(iris_train.shape)  # (105, 4) used to build the model
print(iris_test.shape)  # (45, 4) used to validate the model

# 4. Build the model
# help(LinearRegression)
lr_model = LinearRegression()  # model object with default settings

# fit(train_x, train_y): learn the model from the train set
lr_model.fit(iris_train[x_cols], iris_train[y_cols])

# Regression coefficients (slopes) and intercept
print("기울기=", lr_model.coef_)  # recorded: [ 0.63924286  0.75744562 -0.68796484]
print("절편=", lr_model.intercept_)  # recorded: 1.8609363992411732

# 5. Evaluate the model
# 1) train set
model_socre1 = lr_model.score(iris_train[x_cols],
                              iris_train[y_cols])
# 2) test set
model_socre2 = lr_model.score(iris_test[x_cols],
                              iris_test[y_cols])

# 1. score
print('train_model score=', model_socre1)  # recorded: 0.8581515699458577
print('test_model score=', model_socre2)  # recorded: 0.854680765745176

# Model predictions vs actual values
pred = lr_model.predict(iris_test[x_cols])  # predictions
Y = iris_test[y_cols]  # actual values

# 2. Mean squared error (MSE)
MSE = mean_squared_error(Y, pred)  # (actual, predicted)
print('MSE=', MSE)  # recorded: 0.11633863200224713

######################
### load_iris()
######################
from sklearn.datasets import load_iris

# 1. Data loading
iris = load_iris()

# 2. Select variables
X = iris.data  # x
y = iris.target  # y (0~2)
print(X.shape)  # (150, 4)
print(y.shape)  # (150,)

# 3. train / test split (7:3)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)
print(x_train.shape)  # (105, 4) - columns 1-4
print(x_test.shape)  # (45, 4)
print(y_train.shape)  # (105,) - column 5
print(y_test.shape)  # (45,)

# 4. Build the model from the train set
lr_model2 = LinearRegression()
lr_model2.fit(x_train, y_train)  # train -> model

print(lr_model2.coef_)  # slopes, recorded: [-0.12591445 -0.0481559   0.24484363  0.57025678]
print(lr_model2.intercept_)  # intercept, recorded: 0.2537496076784179

# 5. Evaluate the model on the test set
# 1) score
model_score = lr_model2.score(x_test, y_test)
print(model_score)  # recorded: 0.9427868501294299

# 2) MSE (predicted vs actual)
pred = lr_model2.predict(x_test)
Y = y_test
MSE = mean_squared_error(Y, pred)  # (actual, predicted); MSE is symmetric
print('MSE=', MSE)  # recorded: 0.04447086315865546

# Manual cross-check of the same quantity:
#   E = pred - Y
#   squared = E^2
import numpy as np
mes = np.mean((pred - Y) ** 2)
# Print the manually computed value (previously MSE was printed a second time,
# so the cross-check never displayed its own result).
print('MSE=', mes)  # expected to match: 0.04447086315865546

# 3) Visual evaluation
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20, 5))
chart = fig.add_subplot(1, 1, 1)
chart.plot(pred, color='r', label="pred")
chart.plot(Y, color='b', label="y")
plt.legend(loc='best')
plt.show()
# -*- coding: utf-8 -*-
# scikit-learn logistic regression, for a categorical y variable.
from sklearn.datasets import load_iris  # multi-class classification
from sklearn.datasets import load_breast_cancer  # binary classification
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# pandas is used below as `pd` (pd.crosstab); it was previously aliased as
# `np`, which left `pd` undefined unless another section had imported it.
import pandas as pd

#####################################
## 1. load_breast_cancer : binary classification
#####################################

# 1. Loading data
breast = load_breast_cancer()

# 2. Select variables
X = breast.data
y = breast.target
print(X.shape, y.shape)  # (569, 30) (569,)

# 3. Build the model
# help(LogisticRegression)
#   1. random_state: random seed
#   2. solver: optimisation algorithm
#      {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}
#      (default was 'liblinear' in the version this was written against)
#      small datasets: 'liblinear'
#      large datasets: 'sag', 'saga'
#      multi-class problems: 'newton-cg', 'lbfgs'
#      multinomial classification: 'multinomial'
# Usage examples:
#   1. ordinary dataset, binary classification: defaults
#   2. ordinary dataset, multi-class: solver='lbfgs', multi_class="multinomial"
#   3. big dataset, binary classification: solver='sag'

# Model object
lr_model = LogisticRegression(random_state=0)
lr_model.fit(X, y)  # build the model

# Predictions
pred = lr_model.predict(X)
print('prdict=', pred[:5])  # recorded: [0 0 0 1 0]
print('y정답=', y[:5])  # recorded: [0 0 0 0 0]

# Model evaluation: score = classification accuracy
score = lr_model.score(X, y)
print(score)  # recorded: 0.9595782073813708

# Confusion matrix (cross tabulation)
tab = pd.crosstab(y, pred)  # crosstab(rows: actual, cols: predicted)
print(tab)
# Recorded output:
#   col_0    0    1
#   row_0
#   0      198   14
#   1        9  348

# Accuracy = correctly classified (diagonal) / total. Read the counts from the
# table rather than hard-coding 198 and 348, which depend on the fitted model.
acc = (tab.iloc[0, 0] + tab.iloc[1, 1]) / len(y)
print('accuracy=', acc)  # recorded: 0.9595782073813708

#################################
## 2. load_iris : multi-class classification
#################################

# 1. Data loading
X, y = load_iris(return_X_y=True)

# 2. Build the model
# NOTE(review): the multi_class argument is reported as deprecated in recent
# scikit-learn releases - confirm against the installed version.
lr_model2 = LogisticRegression(random_state=123,
                               solver='lbfgs',
                               multi_class="multinomial")
lr_model2.fit(X, y)
print(lr_model2)  # model info
# Recorded output:
#   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#             intercept_scaling=1, max_iter=100, multi_class='multinomial',
#             n_jobs=1, penalty='l2', random_state=123, solver='lbfgs',
#             tol=0.0001, verbose=0, warm_start=False)

# Predictions
pred = lr_model2.predict(X)  # predicted
Y = y  # actual

score = lr_model2.score(X, y)
print('accuracy=', score)  # recorded: 0.9733333333333334

tab = pd.crosstab(Y, pred)
print(tab)
# Recorded output:
#   col_0   0   1   2
#   row_0
#   0      50   0   0
#   1       0  47   3
#   2       0   1  49

print(type(tab))  # <class 'pandas.core.frame.DataFrame'>

# Positional access uses .iloc; DataFrame.ix has been removed from pandas.
acc = (tab.iloc[0, 0] + tab.iloc[1, 1] + tab.iloc[2, 2]) / len(y)
print('accuracy=', acc)  # recorded: 0.9733333333333334

# Visualising classification accuracy
import seaborn as sn  # heatmap - Accuracy Score

# Confusion matrix heatmap
plt.figure(figsize=(6, 6))  # chart size
sn.heatmap(tab, annot=True, fmt=".3f", linewidths=.5, square=True)  # optional: cmap='Blues_r' (colour map)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=18)
plt.show()
