03.Regression
01.regression
# -*- coding: utf-8 -*-
"""
Linear regression with the scipy package (simple) and statsmodels (multiple).

The script loads ../data/score_iq.csv (150 rows x 6 int64 columns:
sid, score, iq, academy, game, tv), fits score ~ iq with
scipy.stats.linregress, then fits score ~ iq + academy with statsmodels OLS.
"""
from scipy import stats  # linear regression module
import pandas as pd
import statsmodels.formula.api as sm

score_df = pd.read_csv("../data/score_iq.csv")
# NOTE: DataFrame.info() prints its report itself and returns None,
# so this also prints a trailing "None".
print(score_df.info())  # 150 x 6
print(score_df.head())
# Recorded output:
#      sid  score   iq  academy  game  tv
# 0  10001     90  140        2     1   0
# 1  10002     75  125        1     3   3
# 2  10003     77  120        1     0   4
# 3  10004     83  135        2     3   2
# 4  10005     65  105        0     4   4

# 1) Simple linear regression
# one independent variable (x) -> one dependent variable (y)
x = score_df.iq  # same as score_df['iq']
y = score_df.score  # same as score_df['score']

# Fit the simple linear regression model.
model = stats.linregress(x, y)

# Model result
print('model=', model)
# Recorded output:
# LinregressResult(
#   slope=0.6514309527270075,       -> slope
#   intercept=-2.8564471221974657,  -> intercept
#   rvalue=0.8822203446134699,      -> correlation coefficient (closer to 1 is better)
#   pvalue=2.8476895206683644e-50,  -> model significance (not meaningful if > 0.05)
#   stderr=0.028577934409305443)    -> standard error

# Regression equation = first-degree function: Y = a*X + b
# (a: slope, b: intercept). First observation: score=90, iq=140.
# The intercept must be ADDED; subtracting it does not reproduce the
# recorded prediction 88.3438... below.
Y = model.slope * 140 + model.intercept
print("점수 예측치=", Y)  # recorded: 88.34388625958358
err = 90 - Y
print("모델 오차=", err)  # recorded: 1.6561137404164157

print('x 기울기=', model.slope)  # recorded: 0.6514309527270075
print('x 절편=', model.intercept)  # recorded: -2.8564471221974657
print('x 설명력=', model.rvalue)  # recorded: 0.8822203446134699
print('x 유의성=', model.pvalue)  # recorded: 2.8476895206683644e-50
print('x 표준오차=', model.stderr)  # recorded: 0.028577934409305443

# 2) Multiple linear regression
#  - two or more independent variables (X)
corr = score_df.corr()
print("상관 계수 행렬")
print(corr)
# Recorded correlations with score:
#   iq 0.882220, academy 0.896265, game -0.298193, tv -0.819752, sid -0.014399

# Model variables: X(iq, academy) -> y(score)
model = sm.ols(formula="score ~ iq + academy", data=score_df).fit()
print("model", model)  # object info (RegressionResultsWrapper)

# Model parameters: intercept and slopes
print(model.params)
# Recorded output:
# Intercept    25.229141  -> intercept
# iq            0.376966  -> slope of X1
# academy       2.992800  -> slope of X2

# Multiple linear regression equation applied to the first observation
# (iq=140, academy=2), using the coefficients recorded above.
print(score_df.head())
Y = 0.376966 * 140 + 2.992800 * 2 + 25.229141
print("예측치=", Y)  # recorded: 83.989981

# Model summary
print(model.summary())
# Key figures from the recorded summary:
#   R-squared 0.946, Adj. R-squared 0.946, F-statistic 1295.,
#   Prob (F-statistic) 4.50e-94, No. Observations 150
#   Intercept 25.2291 (std err 2.187), iq 0.3770 (0.019), academy 2.9928 (0.140)
#   Warning [2]: condition number is large (2.18e+03), possible multicollinearity.
#
# How to read it:
# 1. Prob (F-statistic): model significance (should be below 0.05)
# 2. Adj. R-squared: explanatory power (closer to 1 is better)
# 3. P>|t|: significance test per X variable (should be below 0.05)
02.dot_regression
# -*- coding: utf-8 -*-
"""
Example of applying a matrix product (dot) to regression-model prediction.

Uses the coefficients of the previously fitted score ~ iq + academy model
to compute fitted values for every row with a single np.dot call, then
compares them with the observed scores.
"""
import pandas as pd
import numpy as np

# 1. Load the data set
score_df = pd.read_csv("../data/score_iq.csv")
print(score_df.head())  # 6 columns: sid, score, iq, academy, game, tv

# 2. Build the subset
score_arr = score_df[['score', 'iq', 'academy']]  # 3 columns
print(score_arr.shape)  # recorded: (150, 3)
print(score_arr.info())

# 3. Select X and y
# DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
score_X = score_arr.iloc[:, 1:]  # iq, academy -> 2-D (150 x 2)
score_y = score_arr.iloc[:, 0]  # score -> 1-D (150)
print(score_X.shape)  # recorded: (150, 2)
print(score_y.shape)  # recorded: (150,)

# 4. Slopes and intercept (taken from the fitted OLS model)
#   Intercept 25.229141, iq 0.376966 (X1 slope), academy 2.992800 (X2 slope)
slop = np.array([[0.376966], [2.992800]])  # 2-D column vector
Intercept = 25.229141  # scalar

# Y = (a1*x1 + a2*x2) + b, where (a1*x1 + a2*x2) is a matrix product.

# 5. Apply the matrix product (dot)
print(score_X.shape)  # recorded: (150, 2)
print(slop.shape)  # (2, 1)
# (150, 2) . (2, 1) = (150, 1)
matmul = np.dot(score_X, slop)
Y = matmul + Intercept
print(Y)  # recorded: [[83.989981] [75.342691] ... [73.457861]]

# 6. Model evaluation (observed vs. predicted)
# Y = predictions, score_y = observed values
print(Y.shape)  # recorded: (150, 1)
print(score_y.shape)  # recorded: (150,)

# Flatten 2-D -> 1-D; -1 lets numpy infer the length instead of hard-coding 150.
Y_fitted = Y.reshape(-1)

df = pd.DataFrame({"Y_fitted": Y_fitted, 'score': score_y})
print(df)  # (150, 2)

# Correlation analysis
print(df.head())
# Recorded output:
#     Y_fitted  score
# 0  83.989981     90
# 1  75.342691     75
# 2  73.457861     77
# 3  82.105151     83
# 4  64.810571     65
cor = df.Y_fitted.corr(df.score)
print('corr=', cor)  # recorded: 0.9727792069594755
03.sklearn_Dataset
# -*- coding: utf-8 -*-
"""
Tour of the data sets bundled with sklearn.

Loads several sklearn.datasets loaders and prints their shapes, targets
and label names.
"""
from sklearn import datasets
import numpy as np

# 1. Data sets listed for linear regression
# 1) iris
iris = datasets.load_iris()
print(iris)

iris_x = iris.data  # x
iris_y = iris.target  # y
print(type(iris_x))  # recorded: <class 'numpy.ndarray'>
print(np.shape(iris_x))  # recorded: (150, 4)
print(np.shape(iris_y))  # recorded: (150,)

print(iris_x)  # recorded first row: [5.1 3.5 1.4 0.2]
print(iris_y)  # recorded: [0 0 ... 0 0]

# y categories
print(list(iris.target_names))
# setosa=0, versicolor=1, virginica=2

# 2) diabetes data set
diabetes = datasets.load_diabetes()
diabetes_x = diabetes.data  # x
diabetes_y = diabetes.target  # y
print(diabetes_x.shape)  # recorded: (442, 10)
print(diabetes_y.shape)  # recorded: (442,)
print(diabetes_y)

# 3) boston data set
# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where
# accessing it raises ImportError. Guard it so the remaining sections of
# this tour still run on current releases.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError) as exc:
    print("load_boston is unavailable in this scikit-learn version:", exc)
else:
    boston_x = boston.data
    boston_y = boston.target
    print(boston_x.shape)  # recorded: (506, 13)
    print(boston_y.shape)  # recorded: (506,)
    print(boston.feature_names)
    # recorded: ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD'
    #            'TAX' 'PTRATIO' 'B' 'LSTAT']

# 2. Data sets suited to classification
# 4) wine data set: multi-class classification (softmax function)
# e.g. class_0: 0.98 + class_1: 0.01 + class_2: 0.01 = 1
wine = datasets.load_wine()
wine_x = wine.data  # (178, 13) per the shape printed below
wine_y = wine.target  # one label per row, values 0..2
print(wine.target_names)  # recorded: ['class_0' 'class_1' 'class_2']
print(wine_x.shape)  # recorded: (178, 13)
print(wine_y)  # recorded: a run of 0s, then 1s, then 2s

# 5) binary classification (sigmoid function)
# The sigmoid output is split into YES / NO at the 0.5 threshold.
breast = datasets.load_breast_cancer()
print(breast.data.shape)  # recorded: (569, 30)
print(breast.target.shape)  # recorded: (569,)
print(breast.target_names)  # recorded: ['malignant' 'benign']
print(breast)
04.sklearn_Regression
# -*- coding: utf-8 -*-
"""
sklearn regression models - for a continuous y variable.

Part 1 fits LinearRegression on ../data/iris.csv (Sepal.Length from the
other three numeric columns). Part 2 repeats the workflow on load_iris().
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression  # model
from sklearn.model_selection import train_test_split  # train set vs test set
from sklearn.metrics import mean_squared_error  # MSE (mean squared error)

# 1. Load the data set
iris = pd.read_csv("../data/iris.csv")
# NOTE: DataFrame.info() prints its report and returns None.
print(iris.info())  # recorded: 150 rows; 4 float64 columns + Species (object)
print(iris.head())

# 2. Select variables (x, y)
cols = list(iris.columns)
print(cols)
# recorded: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

x_cols = cols[1:4]  # 'Sepal.Width', 'Petal.Length', 'Petal.Width'
y_cols = cols[0]  # 'Sepal.Length'

# Subset: the four numeric columns
data_df = iris[cols[:4]]
print(data_df.shape)  # recorded: (150, 4)

# 3. Train set (70%) / test set (30%)
# The split is random; random_state=123 makes it reproducible.
iris_train, iris_test = train_test_split(
    data_df, test_size=0.3, random_state=123)
print(iris_train.shape)  # recorded: (105, 4) - used to build the model
print(iris_test.shape)  # recorded: (45, 4) - used to validate the model

# 4. Build the model
lr_model = LinearRegression()  # model object with default settings

# fit(train_x, train_y): learn from the train set
lr_model.fit(iris_train[x_cols], iris_train[y_cols])

# Regression coefficients (slopes) and intercept
print("기울기=", lr_model.coef_)  # recorded: [ 0.63924286  0.75744562 -0.68796484]
print("절편=", lr_model.intercept_)  # recorded: 1.8609363992411732

# 5. Evaluate the model
# 1) train set
model_socre1 = lr_model.score(iris_train[x_cols], iris_train[y_cols])
# 2) test set
model_socre2 = lr_model.score(iris_test[x_cols], iris_test[y_cols])

# 5-1. score
print('train_model score=', model_socre1)  # recorded: 0.8581515699458577
print('test_model score=', model_socre2)  # recorded: 0.854680765745176

# Predictions vs. observed values
pred = lr_model.predict(iris_test[x_cols])  # predictions
Y = iris_test[y_cols]  # observed values

# 5-2. Mean squared error (MSE)
MSE = mean_squared_error(Y, pred)  # (y_true, y_pred)
print('MSE=', MSE)  # recorded: 0.11633863200224713

######################
### load_iris()
######################

# 1. Data loading
iris = load_iris()

# 2. Select variables
X = iris.data  # x
y = iris.target  # y (0~2)
print(X.shape)  # recorded: (150, 4)
print(y.shape)  # recorded: (150,)

# 3. Train / test split (7:3)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)
print(x_train.shape)  # recorded: (105, 4)
print(x_test.shape)  # recorded: (45, 4)
print(y_train.shape)  # recorded: (105,)
print(y_test.shape)  # recorded: (45,)

# 4. Build the model on the train set
lr_model2 = LinearRegression()
lr_model2.fit(x_train, y_train)  # train -> model

print(lr_model2.coef_)  # slopes; recorded: [-0.12591445 -0.0481559 0.24484363 0.57025678]
print(lr_model2.intercept_)  # intercept; recorded: 0.2537496076784179

# 5. Evaluate the model on the test set
# 1) score
model_score = lr_model2.score(x_test, y_test)
print(model_score)  # recorded: 0.9427868501294299

# 2) MSE (observed vs. predicted)
pred = lr_model2.predict(x_test)
Y = y_test
MSE = mean_squared_error(Y, pred)  # (y_true, y_pred) argument order
print('MSE=', MSE)  # recorded: 0.04447086315865546

# Same metric by hand: E = pred - Y, squared = E^2, then the mean.
mes = np.mean((pred - Y) ** 2)
# Print the hand-computed value (previously MSE was printed again by mistake).
print('MSE=', mes)  # should match the sklearn value above

# 3) Visual evaluation: predictions (red) against observed values (blue)
fig = plt.figure(figsize=(20, 5))
chart = fig.add_subplot(1, 1, 1)
chart.plot(pred, color='r', label="pred")
chart.plot(Y, color='b', label="y")
plt.legend(loc='best')
plt.show()
05.LogisticRegression
# -*- coding: utf-8 -*-
"""
sklearn logistic regression - for a categorical y variable.

Part 1 fits a binary classifier on load_breast_cancer(); part 2 fits a
multinomial classifier on load_iris() and plots its confusion matrix.
"""
from sklearn.datasets import load_iris  # multi-class classification
from sklearn.datasets import load_breast_cancer  # binary classification
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Was "import pandas as np": that left `pd` undefined, so every
# pd.crosstab call below raised NameError.
import pandas as pd
import seaborn as sn

#####################################
## 1. load_breast_cancer : binary classification
#####################################

# 1. Load data
breast = load_breast_cancer()

# 2. Select variables
X = breast.data
y = breast.target
print(X.shape, y.shape)  # recorded: (569, 30) (569,)

# 3. Build the model
# Author's notes on LogisticRegression options (written against the
# scikit-learn version in use at the time; check the current docs):
#  1. random_state: random seed
#  2. solver: optimisation algorithm
#     {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}
#     small data sets: 'liblinear'
#     large data sets: 'sag', 'saga'
#     multi-class problems: 'newton-cg', 'lbfgs'
# Usage guide:
#  1. ordinary data set, binary classification: defaults
#  2. ordinary data set, multi-class: solver='lbfgs', multi_class="multinomial"
#  3. big data set, binary classification: solver='sag'

lr_model = LogisticRegression(random_state=0)
lr_model.fit(X, y)  # fit the model

# Predictions
pred = lr_model.predict(X)
print('prdict=', pred[:5])  # recorded: [0 0 0 1 0]
print('y정답=', y[:5])  # recorded: [0 0 0 0 0]

# Model evaluation: score = classification accuracy
score = lr_model.score(X, y)
print(score)  # recorded: 0.9595782073813708

# Confusion matrix: crosstab(rows: observed, columns: predicted)
tab = pd.crosstab(y, pred)
print(tab)
# Recorded output:
# col_0    0    1
# row_0
# 0      198   14
# 1        9  348

# Accuracy from the diagonal of the table. Reading the counts from `tab`
# (instead of hard-coding 198 + 348) keeps this correct if the fit changes.
acc = (tab.iloc[0, 0] + tab.iloc[1, 1]) / len(y)
print('accuracy=', acc)  # recorded: 0.9595782073813708

#################################
## 2. load_iris : multi-class classification
#################################

# 1. Data loading
X, y = load_iris(return_X_y=True)

# 2. Build the model
# NOTE(review): multi_class is deprecated in recent scikit-learn releases -
# confirm against the installed version before upgrading.
lr_model2 = LogisticRegression(random_state=123,
                               solver='lbfgs',
                               multi_class="multinomial")
lr_model2.fit(X, y)
print(lr_model2)  # model info (prints the estimator's parameters)

# Predictions
pred = lr_model2.predict(X)  # predicted labels
Y = y  # observed labels

score = lr_model2.score(X, y)
print('accuracy=', score)  # recorded: 0.9733333333333334

tab = pd.crosstab(Y, pred)
print(tab)
# Recorded output:
# col_0   0   1   2
# row_0
# 0      50   0   0
# 1       0  47   3
# 2       0   1  49

print(type(tab))  # recorded: <class 'pandas.core.frame.DataFrame'>

# DataFrame.ix was removed in pandas 1.0; .iloc reads the same diagonal cells.
acc = (tab.iloc[0, 0] + tab.iloc[1, 1] + tab.iloc[2, 2]) / len(y)
print('accuracy=', acc)  # recorded: 0.9733333333333334

# Visualise the classification accuracy: confusion-matrix heatmap
plt.figure(figsize=(6, 6))  # chart size
# Optional: cmap='Blues_r' sets the colour map.
sn.heatmap(tab, annot=True, fmt=".3f", linewidths=.5, square=True)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=18)
plt.show()
03.Regression的更多相关文章
- 线性回归 Linear Regression
成本函数(cost function)也叫损失函数(loss function),用来定义模型与观测值的误差.模型预测的价格与训练集数据的差异称为残差(residuals)或训练误差(test err ...
- Multivariance Linear Regression练习
%% 方法一:梯度下降法 x = load('E:\workstation\data\ex3x.dat'); y = load('E:\workstation\data\ex3y.dat'); x = ...
- SparkMLlib之 logistic regression源码分析
最近在研究机器学习,使用的工具是spark,本文是针对spar最新的源码Spark1.6.0的MLlib中的logistic regression, linear regression进行源码分析,其 ...
- Linear regression with multiple variables(多特征的线型回归)算法实例_梯度下降解法(Gradient DesentMulti)以及正规方程解法(Normal Equation)
,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, , ...
- PRML读书会第三章 Linear Models for Regression(线性基函数模型、正则化方法、贝叶斯线性回归等)
主讲人 planktonli planktonli(1027753147) 18:58:12 大家好,我负责给大家讲讲 PRML的第3讲 linear regression的内容,请大家多多指教,群 ...
- Logistic Regression and Gradient Descent
Logistic Regression and Gradient Descent Logistic regression is an excellent tool to know for classi ...
- 机器学习之多变量线性回归(Linear Regression with multiple variables)
1. Multiple features(多维特征) 在机器学习之单变量线性回归(Linear Regression with One Variable)我们提到过的线性回归中,我们只有一个单一特征量 ...
- 转载 Deep learning:三(Multivariance Linear Regression练习)
前言: 本文主要是来练习多变量线性回归问题(其实本文也就3个变量),参考资料见网页:http://openclassroom.stanford.edu/MainFolder/DocumentPage. ...
- 机器学习笔记-1 Linear Regression with Multiple Variables(week 2)
1. Multiple Features note:X0 is equal to 1 2. Feature Scaling Idea: make sure features are on a simi ...
随机推荐
- VueRouter和Vue生命周期(钩子函数)
一.vue-router路由 1.介绍 vue-router是Vue的路由系统,用于定位资源的,在页面不刷新的情况下切换页面内容.类似于a标签,实际上在页面上展示出来的也是a标签,是锚点.router ...
- Server酱
功能:从服务器推报警和日志到手机上的工具 使用: 发送消息非常简单,只需要向以下URL发一个GET或者POST请求: https://sc.ftqq.com/SCU34756Ta63843ce61a3 ...
- spring boot简单的小demo(适合于初学者)
import com.example.demo2.com.example.dao.ShopDao; import com.example.demo2.com.example.entity.Shops; ...
- jzoj6099. 【GDOI2019模拟2019.4.1】Dist
题目链接:https://jzoj.net/senior/#main/show/6099 考虑直接统计某个点到其它所有点的距离和 我们先把整个团当成一个点建图,处理出任意两个团之间的距离\(dis(i ...
- 【nowcoder-2017校招真题】保留最大的数
牛客在线编程-保留最大的数 题目描述 给定一个十进制的正整数number,选择从里面去掉一部分数字,希望保留下来的数字组成的正整数最大. 输入描述: 输入为两行内容,第一行是正整数number,1 ≤ ...
- 清北学堂Day3
卷积公式(Dirichlet卷积) 这个式子看上去就很变态,那么他是什么意思呢: 就是说 函数f(x)和g(x)对于n的卷积等于n的每一个因子d在f(x)上的值乘上d/n在g(x)上的值的和 例:设g ...
- RabbitMQ队列的使用
为什么要用RabbitMQ 以常见的订单系统为例,用户点击[下单]按钮之后的业务逻辑可能包括:扣减库存.生成相应单据.发红包.发短信通知.在业务发展初期这些逻辑可能放在一起同步执行,随着业务的发展订单 ...
- java 简单程序
public class a{ public static void main(String[] args) { System.out.println("Hello world") ...
- JS学习笔记Day8
一.内置函数Math 1.Math 1)Math.abs() 求绝对值 2)Math.PI 圆周率 2.求近似值: 1)Math.round() 四舍五入(负数: >0.5 进一 <=0. ...
- Entity Framework入门教程(13)---EF中的高并发
EF中的高并发 这里只介绍EF6中database-first开发方案的高并发解决方案,code-first开发方案中的高并发会在以后的EF CodeFirst系列中介绍. EF默认支持乐观并发:我们 ...