python机器学习-乳腺癌细胞挖掘（博主亲自录制视频）

https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

项目联系QQ：231469242

python 2.7

# -*- coding: utf-8 -*-

from statsmodels.stats.multicomp import (pairwise_tukeyhsd,

                                         MultiComparison)

# Import standard packages

import numpy as np

from scipy import stats

import pandas as pd

import variance_check

# Driver script: load the stress-reduction data, check that the groups are
# balanced, then run Tukey's HSD on every pair of treatments.

# Name of the Excel workbook that holds the data
excel="sample.xlsx"

# Load the data
df=pd.read_excel(excel)

# StressReduction values of each treatment group, as plain lists
group_mentaln=list(df.StressReduction[(df.Treatment=="mental")])
group_physical=list(df.StressReduction[(df.Treatment=="physical")])
group_medical=list(df.StressReduction[(df.Treatment=="medical")])

list_groups=[group_mentaln,group_physical,group_medical]
list_total=group_mentaln+group_physical+group_medical

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("equal test-----------------------------------------------------")

# Check whether every group has the same number of samples; the original
# author treats unequal sizes as unsuitable for Tukey-style methods.
equal_lenth=variance_check.Equal_lenth(list_groups)
if not equal_lenth:
    print("the length of groups are not equal")

multiComp = MultiComparison(df['StressReduction'], df['Treatment'])

# Run the test once and reuse the result object for both the summary table
# and the critical q value (the original ran tukeyhsd() twice).
tukey=multiComp.tukeyhsd()
summary=tukey.summary()
print(summary)

q=tukey.q_crit
print("q values:",q)

# Sample q value recorded by the original author:
#
#   Out[41]: 3.5057698487864877

# Sample summary table recorded by the original author:
#
#   Multiple Comparison of Means - Tukey HSD,FWER=0.05
#   ===============================================
#    group1  group2  meandiff  lower  upper  reject
#   -----------------------------------------------
#   medical  mental    1.5     0.3217 2.6783  True
#   medical physical   1.0    -0.1783 2.1783 False
#    mental physical   -0.5   -1.6783 0.6783 False
#   -----------------------------------------------

print("data details:",summary.data)

# Sample summary.data recorded by the original author:
#
#   [['group1', 'group2', 'meandiff', 'lower', 'upper', 'reject'],
#    [u'medical', u'mental', 1.5, 0.32169999999999999, 2.6783000000000001, True],
#    [u'medical', u'physical', 1.0, -0.17829999999999999, 2.1783000000000001, False],
#    [u'mental', u'physical', -0.5, -1.6782999999999999, 0.67830000000000001, False]]

variance_check.py

# -*- coding: utf-8 -*-

'''

用于方差齐性检验

正态性检验

配对相等检验

'''

import scipy,math

from scipy.stats import f

import numpy as np

import matplotlib.pyplot as plt

import scipy.stats as stats

# additional packages

from statsmodels.stats.diagnostic import lillifors

#多重比较

from statsmodels.sandbox.stats.multicomp import multipletests

#用于排列组合

import itertools

'''

#测试数据

group1=[2,3,7,2,6]

group2=[10,8,7,5,10]

group3=[10,13,14,13,15]

list_groups=[group1,group2,group3]

list_total=group1+group2+group3

'''

a=0.05

# Normality test
def check_normality(testData, alpha=0.05):
    """Report whether a sample is compatible with a normal distribution.

    The test is chosen from the sample size ``n = len(testData)``:

    * ``20 < n < 50``    -> ``stats.normaltest``
    * ``n <= 20``        -> ``stats.shapiro`` (Shapiro-Wilk)
    * ``50 <= n <= 300`` -> ``lillifors``
    * ``n > 300``        -> ``stats.kstest`` against a normal distribution
      whose mean and standard deviation are estimated from the sample

    The name of the chosen test and the verdict are printed.

    Parameters:
        testData: sequence of numeric observations.
        alpha: significance level; normality is rejected when the p-value
            is below it. Defaults to 0.05, the value previously hard-coded.

    Returns:
        True when normality is not rejected, False otherwise.
    """
    size = len(testData)

    if 20 < size < 50:
        label = "use normaltest"
        p_value = stats.normaltest(testData)[1]
    elif size < 50:
        # Only reached for size <= 20, the range 21..49 is handled above.
        label = "use shapiro:"
        p_value = stats.shapiro(testData)[1]
    elif size <= 300:
        label = "use lillifors:"
        p_value = lillifors(testData)[1]
    else:
        label = "use kstest:"
        # kstest(data, 'norm') on its own compares against the *standard*
        # normal N(0, 1), which rejects any sample that is not already
        # standardised. Pass the sample's own location and scale instead.
        # NOTE(review): estimating the parameters from the same data makes
        # the KS p-value conservative; confirm this is acceptable here.
        location = np.mean(testData)
        scale = np.std(testData, ddof=1)
        p_value = stats.kstest(testData, 'norm', args=(location, scale))[1]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(label)
    if p_value < alpha:
        print("data are not normal distributed")
        return False
    print("data are normal distributed")
    return True

# Run the normality check on every sample group
def NormalTest(list_groups):
    """Return True unless some group fails ``check_normality``.

    Groups are checked in order and checking stops at the first group whose
    result compares equal to False.
    """
    # "== False" is kept on purpose so the comparison matches the original.
    return not any(check_normality(sample) == False for sample in list_groups)

# Pairwise combinations of the groups
def Combination(list_groups):
    """Return every unordered pair of groups as a list of 2-tuples.

    Pairs come in the order produced by ``itertools.combinations``, e.g.
    ``[a, b, c]`` gives ``[(a, b), (a, c), (b, c)]``.

    The earlier version built the combinations of every size and then kept
    only the pairs, which raised IndexError when fewer than three groups
    were supplied. Now two groups yield their single pair and fewer than
    two groups yield an empty list.

    Parameters:
        list_groups: sequence of groups (each group is itself a sequence).

    Returns:
        list of 2-tuples of groups.
    """
    return list(itertools.combinations(list_groups, 2))

'''

Out[57]:

[[([2, 3, 7, 2, 6], [10, 8, 7, 5, 10]),

  ([2, 3, 7, 2, 6], [10, 13, 14, 13, 15]),

  ([10, 8, 7, 5, 10], [10, 13, 14, 13, 15])]]

'''       

# Homogeneity-of-variance check
def Levene_test(group1,group2,group3,alpha=0.05):
    """Run Levene's test on three groups and report the verdict.

    Parameters:
        group1, group2, group3: sequences of numeric observations.
        alpha: significance level; equal variances are rejected when the
            p-value is below it. Defaults to 0.05, the value previously
            hard-coded.

    Returns:
        True when equal variances are not rejected, False otherwise.
    """
    p_value = scipy.stats.levene(group1, group2, group3)[1]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("levene test:")
    if p_value < alpha:
        print("variances of groups are not equal")
        return False
    print("variances of groups are equal")
    return True

'''

H0成立，三组数据方差无显著差异

Out[9]: LeveneResult(statistic=0.24561403508771934, pvalue=0.7860617221429711)

'''

# Check whether every group holds the same number of usable samples; the
# original author treats unequal sizes as unsuitable for Tukey-style methods.
# Original author's note: NaN exclusion is not fully solved by this function.
def Equal_lenth(list_groups):
    """Return True when all groups have the same number of usable values.

    A value is skipped when its ``str()`` form is ``'nan'`` or ``'-inf'``,
    the same rule as before.

    The earlier version read exactly ``list_groups[0..2]``, so further
    groups were ignored and fewer than three groups raised IndexError. Any
    number of groups is now compared; zero or one group counts as equal.

    Parameters:
        list_groups: sequence of groups (each group is a sequence of values).

    Returns:
        True if every group has the same count of usable values, else False.
    """
    skipped = ('nan', '-inf')
    counts = set(
        sum(1 for value in group if str(value) not in skipped)
        for group in list_groups
    )
    return len(counts) <= 1

'''

#返回True or false

normality=NormalTest(list_groups)

leveneResult=Levene_test(list_groups[0],list_groups[1],list_groups[2])

'''

数据sample.xlsx

https://en.wikipedia.org/wiki/Tukey's_range_test

Tukey's range test, also called Tukey method, Tukey's honest significance test, Tukey's HSD (Honestly Significant Difference) test

老鼠试验数据

公式

D_tukey表示平均数差值的临界值，任何大于D_tukey值的平均数差值都是显著的

第四组和第五组平均数差值是不显著的，其它组的差值是显著的

Studentized Range q Table

a=0.05

http://www.real-statistics.com/statistics-tables/studentized-range-q-table/

q值

q值是学生化极差（studentized range）统计量的查表值；由平均数的数量和组内自由度共同决定

a表示分类组数，df表示所有数量自由度，a_fw表示0.05犯错概率

MS_S/A 表示 within group的方差

s_m值

s_m是一个标准误

n表示每组的样本数

结果

Tukey's range test, also known as the Tukey's test, Tukey method, Tukey's honest significance test, Tukey's HSD (honest significant difference) test,^[1] or the Tukey–Kramer method, is a single-step multiple comparison procedure and statistical test. It can be used on raw data or in conjunction with an ANOVA (post-hoc analysis) to find means that are significantly different from each other. Named after John Tukey,^[2] it compares all possible pairs of means, and is based on a studentized range distribution (q) (this distribution is similar to the distribution of t from the t-test. See below).^[3] The Tukey HSD tests should not be confused with the Tukey Mean Difference tests (also known as the Bland–Altman diagram).

Tukey's test compares the means of every treatment to the means of every other treatment; that is, it applies simultaneously to the set of all pairwise comparisons

$$\mu_i - \mu_j$$

and identifies any difference between two means that is greater than the expected standard error. The confidence coefficient for the set, when all sample sizes are equal, is exactly 1 − α. For unequal sample sizes, the confidence coefficient is greater than 1 − α. In other words, the Tukey method is conservative when there are unequal sample sizes.

Assumptions of Tukey's test

前提条件：

样本独立性+样本正态分布+所有组方差齐性

The observations being tested are independent within and among the groups.
The groups associated with each mean in the test are normally distributed.
There is equal within-group variance across the groups associated with each mean in the test (homogeneity of variance).

The test statistic

Tukey's test is based on a formula very similar to that of the t-test. In fact, Tukey's test is essentially a t-test, except that it corrects for family-wise error rate (when there are multiple comparisons being made, the probability of making a Type I error within at least one of the comparisons, increases — Tukey's test corrects for that, and is thus more suitable for multiple comparisons than a number of t-tests would be).^[3]

The formula for Tukey's test is:

$$q_s = \frac{Y_A - Y_B}{SE},$$

where Y_A is the larger of the two means being compared, Y_B is the smaller of the two means being compared, and SE is the standard error of the data in question.

This q_s value can then be compared to a q value from the studentized range distribution. If the q_s value is larger than the q_critical value obtained from the distribution, the two means are said to be significantly different.^[3]

Since the null hypothesis for Tukey's test states that all means being compared are from the same population (i.e. μ₁ = μ₂ = μ₃ = ... = μ_k), the means should be normally distributed (according to the central limit theorem). This gives rise to the normality assumption of Tukey's test.

The studentized range (q) distribution

The Tukey method uses the studentized range distribution. Suppose that we take a sample of size n from each of k populations with the same normal distribution N(μ, σ) and suppose that $\bar{y}_{\min}$ is the smallest of these sample means and $\bar{y}_{\max}$ is the largest of these sample means, and suppose S² is the pooled sample variance from these samples. Then the following random variable has a Studentized range distribution.

$$q = \frac{\bar{y}_{\max} - \bar{y}_{\min}}{S\sqrt{2/n}}$$

This value of q is the basis of the critical value of q, based on three factors:

α (the Type I error rate, or the probability of rejecting a true null hypothesis)
k (the number of populations)
df (the number of degrees of freedom (N-k) where N is the total number of observations)

The distribution of q has been tabulated and appears in many textbooks on statistics. In some tables the distribution of q has been tabulated without the $\sqrt{2}$ factor. To understand which table it is, we can compute the result for k=2 and compare it to the result of the Student's t-distribution with the same degrees of freedom and the same α. In addition, R offers a cumulative distribution function (ptukey) and a quantile function (qtukey) for q.

Confidence limits

The Tukey confidence limits for all pairwise comparisons with confidence coefficient of at least 1 − α are

$$\bar{y}_{i\bullet} - \bar{y}_{j\bullet} \pm \frac{q_{\alpha;k;N-k}}{\sqrt{2}}\,\widehat{\sigma}_{\varepsilon}\sqrt{\frac{2}{n}}, \qquad i,j = 1,\ldots,k, \quad i \neq j.$$

Notice that the point estimator and the estimated variance are the same as those for a single pairwise comparison. The only difference between the confidence limits for simultaneous comparisons and those for a single comparison is the multiple of the estimated standard deviation.

Also note that the sample sizes must be equal when using the studentized range approach. $\widehat{\sigma}_{\varepsilon}$ is the standard deviation of the entire design, not just that of the two groups being compared. It is possible to work with unequal sample sizes. In this case, one has to calculate the estimated standard deviation for each pairwise comparison as formalized by Clyde Kramer in 1956, so the procedure for unequal sample sizes is sometimes referred to as the Tukey–Kramer method which is as follows:

$$\bar{y}_{i\bullet} - \bar{y}_{j\bullet} \pm \frac{q_{\alpha;k;N-k}}{\sqrt{2}}\,\widehat{\sigma}_{\varepsilon}\sqrt{\frac{1}{n_i} + \frac{1}{n_j}}$$

where n_i and n_j are the sizes of groups i and j respectively. The degrees of freedom for the whole design is also applied.

Advantages and disadvantages

When doing all pairwise comparisons, this method is considered the best available when confidence intervals are needed or sample sizes are not equal. When samples sizes are equal and confidence intervals are not needed Tukey’s test is slightly less powerful than the stepdown procedures, but if they are not available Tukey’s is the next-best choice, and unless the number of groups is large, the loss in power will be slight. In the general case when many or all contrasts might be of interest, Scheffé's method tends to give narrower confidence limits and is therefore the preferred method.

https://github.com/thomas-haslwanter/statsintro_python/tree/master/ISP/Code_Quantlets/08_TestsMeanValues/multipleTesting

https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149（欢迎关注博主主页，学习python视频资源，还有大量免费python经典文章）

Tukey HSD检验法/W法的更多相关文章

MT【330】u,v,w法
已知$a^2+b^2+c^2=1$求$abc(a+b+c)$的最小值.(2018辽宁预赛解答压轴题) 不妨设$a+b+c=3u,ab+bc+ca=3v^2,abc=w^3$,令$u^2=tv^2$要求 ...
进程池、tornado、字体
协程: import grequests from fake_useragent import UserAgent urls=[f'http://bir删d.so/search?page={p ...
CF891C Envy 最小生成树/虚树
正解:最小生成树/虚树解题报告: 传送门! sd如我就只想到了最暴力的想法,一点儿优化都麻油想到,,,真的菜到爆炸了QAQ 然后就分别港下两个正解QAQ 法一,最小生成树这个主要是要想到关于最小生 ...
python数据挖掘介绍
目录一:什么是数据挖掘二:数据挖掘的基本任务三:数据挖掘流程四:数据挖掘建模工具在python对数据的处理方式中,数据挖掘和数据分析是两个重要的方式,目的是为了从数据中获取具有科研或者商 ...
lucene入门创建索引——（二）
1.程序宏观结构图
VMWare里安装64位Linux 的方法
1.CPU AMD系列的CPU略过 Intel系列的CPU芯片需要支持EM64T和VT技术才行,并且BIOS也要支持才可以. 为了确定你的Intel CPU是否支持VT,请查看: http://com ...
一文弄懂神经网络中的反向传播法——BackPropagation
最近在看深度学习的东西,一开始看的吴恩达的UFLDL教程,有中文版就直接看了,后来发现有些地方总是不是很明确,又去看英文版,然后又找了些资料看,才发现,中文版的译者在翻译的时候会对省略的公式推导过程进 ...
机器学习——支持向量机(SVM)之拉格朗日乘子法，KKT条件以及简化版SMO算法分析
SVM有很多实现,现在只关注其中最流行的一种实现,即序列最小优化(Sequential Minimal Optimization,SMO)算法,然后介绍如何使用一种核函数(kernel)的方式将SVM ...
scikit-learn K近邻法类库使用小结
在K近邻法(KNN)原理小结这篇文章,我们讨论了KNN的原理和优缺点,这里我们就从实践出发,对scikit-learn 中KNN相关的类库使用做一个小结.主要关注于类库调参时的一个经验总结. 1. s ...

随机推荐

leetcode个人题解——#33 Search in Rotated Sorted Array
思路:每次取中间元素,一定有一半有序,另一半部分有序,有序的部分进行二分查找,部分有序的部分递归继续处理. class Solution { public: ; int middleSearch(in ...
Alpha冲刺——第七天
Alpha第七天听说 031502543 周龙荣(队长) 031502615 李家鹏 031502632 伍晨薇 031502637 张柽 031502639 郑秦 1.前言任务分配是VV.ZQ. ...
Scrum Meeting Beta - 3
Scrum Meeting Beta - 3 NewTeam 2017/12/1 地点:新主楼F座二楼任务反馈团队成员完成任务计划任务安万贺完成布局方面的界面优化Issue #125 李奕 ...
JavaScript数组去重的四种方法
今天,洗澡的想一个有趣的问题,使用js给数组去重,我想了四种方法,虽然今天的任务没有完成,5555: 不多说,po代码: //方法一:简单循环去重 Array.prototype.unique1 ...
beta-review阶段贡献分分配
小组名称:飞天小女警项目名称:礼物挑选小工具小组成员:沈柏杉(组长).程媛媛.杨钰宁.谭力铭 bera-review阶段各组员的贡献分分配如下: 姓名团队贡献分程媛媛 5.8 沈柏杉 6.1 ...
Spring Security 入门详解
序:本文主要参考 spring实战对里面的知识做一个梳理 1.Spring Security介绍 Spring Security是基于spring的应用程序提供声明式安全保护的安全性框架,它提供了完 ...
JVM初识、调优
JVM是按照运行时数据的存储结构来划分内存结构的,JVM在运行java时,将他们划分成几种不同格式的数据,分别存储在不同的区域,这些数据统一称为运行时数据,运行时数据包括java程序本身的数据信息和J ...
jumpserver的安装部署
废话不说直接安装 1:安装数据库这里是提前安装,也可以不安装,在安装jumpserver主程序的时候,他会询问你是否安装 yum -y install ncurses-devel cmake ech ...
MySQL专题 2 数据库优化 Slow Query log
MySQL Server 有四种类型的日志——Error Log.General Query Log.Binary Log 和 Slow Query Log. 第一个是错误日志,记录 mysqld 的 ...
delphi 中如何执行SqlParameter形式的SQL语句
procedure TForm1.Button1Click(Sender: TObject); begin ADOConnection1.Open('); ADOQuery1.Close; ADOQu ...

Tukey HSD检验法/W法