标签传播算法(Label Propagation)及Python实现
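半监督学习(Semi-supervised learning)发挥作用的场合是:你的数据有一些有label,一些没有,而且一般是绝大部分都没有,只有少许几个有label。半监督学习算法会充分地利用unlabeled数据来捕捉整个数据的潜在分布。它基于三大假设:1)Smoothness平滑假设:相似的数据具有相同的label;2)Cluster聚类假设:处于同一个聚类下的数据具有相同的label;3)Manifold流形假设:处于同一流形结构下的数据具有相同的label。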
半监督学习算法有很多,下面我们介绍最简单的标签传播算法(label propagation),最喜欢简单了,哈哈。
标签传播算法(label propagation)的核心思想非常简单:相似的数据应该具有相同的label。LP算法包括两大步骤:1)构造相似矩阵;2)勇敢的传播吧。
时间复杂度接近线性(记 $n$ 为顶点数,$m$ 为边数):对顶点分配标签的复杂度为 $O(n)$,每次迭代时间为 $O(m)$,找出所有社区的复杂度为 $O(n+m)$,但迭代次数难以估计。
2)标签初始化改进,如提取一些较为紧密的子结构来作为标签传播的初始标签(非重叠最小极大团提取算法 orz。。。)或通过初始社区划分算法先确定社区的雏形再进行传播。
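记 $w_{ij}$ 为节点 $i$ 和节点 $j$ 之间边的权重(相似度),对每一行做归一化得到 $P_{ij}=w_{ij}/\sum_{k=1}^{N}w_{ik}$,$P_{ij}$ 表示从节点 $i$ 转移到节点 $j$ 的概率。假设有 $C$ 个类和 $L$ 个labeled样本,我们定义一个 $L\times C$ 的label矩阵 $Y_L$,第 $i$ 行表示第 $i$ 个样本的标签指示向量,即如果第 $i$ 个样本的类别是 $j$,那么该行的第 $j$ 个元素为1,其他为0。同样,我们也给 $U$ 个unlabeled样本一个 $U\times C$ 的label矩阵 $Y_U$。把它们合并,我们得到一个 $N\times C$ 的soft label矩阵 $F=[Y_L;Y_U]$,其中 $N=L+U$。soft label的意思是,我们保留样本 $i$ 属于每个类别的概率,而不是互斥地认为这个样本以概率1只属于某一个类。当然了,最后确定样本 $i$ 的类别的时候,是取max,也就是把概率最大的那个类作为它的类别。那 $F$ 里面有个 $Y_U$,它一开始是不知道的,那最开始的值是多少?无所谓,随便设置一个值就可以了。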
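我们知道,我们每次迭代都是计算一个soft label矩阵 $F=[Y_L;Y_U]$,但是 $Y_L$ 是已知的,计算它没有什么用,在步骤2)的时候,还得把它弄回来。我们关心的只是 $Y_U$,那我们能不能只计算 $Y_U$ 呢?Yes。我们将矩阵 $P$ 做以下划分:$$P=\begin{bmatrix}P_{LL} & P_{LU}\\ P_{UL} & P_{UU}\end{bmatrix}$$ 这时候,每次迭代只需要计算 $F_U \leftarrow P_{UU}F_U + P_{UL}Y_L$,重复这个步骤直到收敛。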
迭代收敛时 $F_U$ 满足 $F_U=(I-P_{UU})^{-1}P_{UL}Y_L$,所以我们也可以直接这样求解,以获得最终的 $Y_U$。但是在实际的应用过程中,由于矩阵求逆需要 $O(n^3)$ 的复杂度,所以如果unlabeled数据非常多,那么 $I-P_{UU}$ 矩阵的求逆将会非常耗时,因此这时候一般选择迭代算法来实现。
- #***************************************************************************
- #*
- #* Description: label propagation
- #* Author: Zou Xiaoyi (zouxy09@qq.com)
- #* Date: 2015-10-15
- #* HomePage: http://blog.csdn.net/zouxy09
- #*
- #**************************************************************************
- import time
- import numpy as np
- # return k neighbors index
def navie_knn(dataSet, query, k):
    """Return the row indices of the ``k`` rows of ``dataSet`` closest to ``query``.

    Closeness is squared Euclidean distance. Indices are ordered from the
    nearest row to the farthest. When ``k`` is larger than the number of rows,
    every row index is returned.
    """
    row_count = dataSet.shape[0]

    # Squared Euclidean distance from the query to every row.
    offsets = np.tile(query, (row_count, 1)) - dataSet
    dist_sq = np.sum(np.square(offsets), axis=1)

    # Rank the rows by distance and keep at most k of them.
    order = np.argsort(dist_sq)
    return order[0:min(k, len(order))]
- # build a big graph (normalized weight matrix)
def buildGraph(MatX, kernel_type, rbf_sigma = None, knn_num_neighbors = None):
    """Build the dense, row-normalised affinity (transition) matrix of ``MatX``.

    Args:
        MatX: 2-D array with one sample per row.
        kernel_type: ``'rbf'`` or ``'knn'``.
        rbf_sigma: bandwidth of the RBF kernel; required for ``'rbf'``.
        knn_num_neighbors: neighbour count; required for ``'knn'``.

    Returns:
        A ``(num_samples, num_samples)`` ``float32`` array. For ``'rbf'`` row
        ``i`` holds ``exp(-||x_i - x_j||^2 / (2 * sigma^2))`` divided by the
        row sum. For ``'knn'`` row ``i`` puts equal weight on the nearest rows
        returned by ``navie_knn`` (the sample itself is one of the candidates)
        and zero elsewhere.

    Raises:
        ValueError: the parameter needed by the chosen kernel is missing, or
            ``knn_num_neighbors`` is smaller than 1.
        NameError: ``kernel_type`` is not supported.
    """
    num_samples = MatX.shape[0]
    affinity_matrix = np.zeros((num_samples, num_samples), np.float32)
    if kernel_type == 'rbf':
        if rbf_sigma is None:
            raise ValueError('You should input a sigma of rbf kernel!')
        points = np.asarray(MatX, dtype=np.float64)
        scale = -2.0 * rbf_sigma ** 2
        for i in range(num_samples):
            # One vectorised pass per row instead of a Python loop over j.
            diff = points - points[i]
            weights = np.exp(np.sum(diff ** 2, axis=1) / scale)
            affinity_matrix[i, :] = weights / weights.sum()
    elif kernel_type == 'knn':
        if knn_num_neighbors is None:
            raise ValueError('You should input a k of knn kernel!')
        if knn_num_neighbors < 1:
            raise ValueError('The k of knn kernel must be at least 1!')
        for i in range(num_samples):
            k_neighbors = navie_knn(MatX, MatX[i, :], knn_num_neighbors)
            # navie_knn returns fewer than k indices when k exceeds the number
            # of samples; dividing by the count actually returned keeps every
            # row summing to 1.
            affinity_matrix[i, k_neighbors] = 1.0 / len(k_neighbors)
    else:
        raise NameError('Not support kernel type! You can use knn or rbf!')
    return affinity_matrix
- # label propagation
def labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 1.5,
                    knn_num_neighbors = 10, max_iter = 500, tol = 1e-3):
    """Predict labels for unlabeled samples by iterative label propagation.

    Args:
        Mat_Label: 2-D array of labeled samples, one per row.
        Mat_Unlabel: 2-D array of unlabeled samples, one per row.
        labels: class value of each row of ``Mat_Label``. The values may be
            any set of class ids; they are mapped to column positions
            internally, so they do not have to be ``0..C-1``.
        kernel_type: ``'rbf'`` or ``'knn'``, forwarded to ``buildGraph``.
        rbf_sigma: RBF bandwidth, forwarded to ``buildGraph``.
        knn_num_neighbors: neighbour count, forwarded to ``buildGraph``.
        max_iter: maximum number of propagation steps.
        tol: stop once the summed absolute change of the label matrix between
            two consecutive steps is not larger than this value.

    Returns:
        1-D array with one predicted class value per row of ``Mat_Unlabel``.
        Numeric class values are returned as ``float64``, so for labels
        ``0..C-1`` the result equals the arg-max column index as before.
    """
    # initialize
    num_label_samples = Mat_Label.shape[0]
    num_unlabel_samples = Mat_Unlabel.shape[0]
    num_samples = num_label_samples + num_unlabel_samples
    # label_index[i] is the column of labels[i] inside the sorted unique list.
    labels_list, label_index = np.unique(np.asarray(labels).ravel(), return_inverse=True)
    label_index = np.asarray(label_index).ravel()[:num_label_samples]
    num_classes = len(labels_list)

    MatX = np.vstack((Mat_Label, Mat_Unlabel))
    # One-hot rows of the labeled samples; re-imposed after every step.
    clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)
    clamp_data_label[np.arange(num_label_samples), label_index] = 1.0

    label_function = np.zeros((num_samples, num_classes), np.float32)
    label_function[0 : num_label_samples] = clamp_data_label
    # Arbitrary start value for the unknown rows.
    label_function[num_label_samples : num_samples] = -1

    # graph construction
    affinity_matrix = buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)

    # start to propagation
    iteration = 0
    pre_label_function = np.zeros((num_samples, num_classes), np.float32)
    changed = np.abs(pre_label_function - label_function).sum()
    while iteration < max_iter and changed > tol:
        print("---> Iteration %d/%d, changed: %f" % (iteration, max_iter, changed))
        pre_label_function = label_function
        iteration += 1

        # propagation (np.dot returns a new array, the previous one is kept)
        label_function = np.dot(affinity_matrix, label_function)

        # clamp
        label_function[0 : num_label_samples] = clamp_data_label

        # check converge
        changed = np.abs(pre_label_function - label_function).sum()

    # get terminate label of unlabeled data
    best_column = np.argmax(label_function[num_label_samples : num_samples], axis=1)
    unlabel_data_labels = labels_list[best_column]
    if np.issubdtype(unlabel_data_labels.dtype, np.number):
        # Keep the float64 result type of the previous implementation.
        unlabel_data_labels = unlabel_data_labels.astype(np.float64)
    return unlabel_data_labels
- import time
- import math
- import numpy as np
- from label_propagation import labelPropagation
- # show
def show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels):
    """Scatter-plot the labeled and unlabeled samples, coloured by class.

    Labeled samples are drawn as diamonds and unlabeled samples as circles.
    Class 0 is red, class 1 is blue and every other class is yellow. Only the
    first two columns of each sample matrix are plotted.
    """
    import matplotlib.pyplot as plt

    colour_of_class = {0: 'r', 1: 'b'}

    def draw(points, classes, marker):
        # One plot call per point, in row order.
        for row in range(points.shape[0]):
            colour = colour_of_class.get(int(classes[row]), 'y')
            plt.plot(points[row, 0], points[row, 1], marker + colour)

    draw(Mat_Label, labels, 'D')
    draw(Mat_Unlabel, unlabel_data_labels, 'o')

    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.xlim(0.0, 12.)
    plt.ylim(0.0, 12.)
    plt.show()
def _noisy_ring(center, radius, count, step_deg):
    """Return ``count`` float32 points on a circle, each shifted by uniform noise in [0, 1)."""
    angles = ((np.arange(count) * float(step_deg)) % 360) * math.pi / 180
    noise = np.random.rand(count, 2)  # column 0: x noise, column 1: y noise
    ring = radius * np.column_stack((np.cos(angles), np.sin(angles)))
    return (ring + noise + center).astype(np.float32)


def loadCircleData(num_data):
    """Generate a two-ring toy data set with one labeled point per ring.

    Args:
        num_data: total number of unlabeled points. ``num_data // 3`` of them
            go on the inner ring (radius 2, one point every 2 degrees) and the
            rest on the outer ring (radius 4, one point every degree). Both
            rings are centred at (5, 5).

    Returns:
        ``(Mat_Label, labels, Mat_Unlabel)``. ``Mat_Label`` is a ``(2, 2)``
        float32 array holding one point for the inner and one for the outer
        ring, ``labels`` is ``[0, 1]`` and ``Mat_Unlabel`` is a
        ``(num_data, 2)`` float32 array with the inner-ring points first.
    """
    center = np.array([5.0, 5.0])
    radiu_inner = 2
    radiu_outer = 4
    # Floor division keeps the counts integral on Python 3 as well.
    num_inner = num_data // 3
    num_outer = num_data - num_inner

    Mat_Unlabel = np.vstack((
        _noisy_ring(center, radiu_inner, num_inner, 2),
        _noisy_ring(center, radiu_outer, num_outer, 1),
    ))

    Mat_Label = np.zeros((2, 2), np.float32)
    Mat_Label[0] = center + np.array([-radiu_inner + 0.5, 0])
    Mat_Label[1] = center + np.array([-radiu_outer + 0.5, 0])
    labels = [0, 1]
    return Mat_Label, labels, Mat_Unlabel
def loadBandData(num_unlabel_samples):
    """Generate a two-band toy data set with one labeled point per band.

    Args:
        num_unlabel_samples: total number of unlabeled points. The first
            ``num_unlabel_samples // 2`` are scattered around the first
            labeled point and the remaining ones around the second, so odd
            counts are supported.

    Returns:
        ``(Mat_Label, labels, Mat_Unlabel)``. ``Mat_Label`` is
        ``[[5, 2], [5, 8]]``, ``labels`` is ``[0, 1]`` and ``Mat_Unlabel`` is
        a ``(num_unlabel_samples, 2)`` float32 array. Each band is uniform
        within +/-1.5 horizontally and +/-0.5 vertically of its labeled point.
    """
    Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]])
    labels = [0, 1]
    num_dim = Mat_Label.shape[1]
    band_extent = np.array([3, 1])  # full width and height of one band

    # Integer sizes: the second band absorbs the extra point of an odd count.
    num_first = num_unlabel_samples // 2
    num_second = num_unlabel_samples - num_first

    Mat_Unlabel = np.zeros((num_unlabel_samples, num_dim), np.float32)
    Mat_Unlabel[:num_first, :] = (np.random.rand(num_first, num_dim) - 0.5) * band_extent + Mat_Label[0]
    Mat_Unlabel[num_first:, :] = (np.random.rand(num_second, num_dim) - 0.5) * band_extent + Mat_Label[1]
    return Mat_Label, labels, Mat_Unlabel
- # main function
if __name__ == "__main__":
    num_unlabel_samples = 800

    # Demo data: two concentric rings. loadBandData(num_unlabel_samples)
    # returns the same three values for a two-band data set instead.
    Mat_Label, labels, Mat_Unlabel = loadCircleData(num_unlabel_samples)

    # Notice: with the 'rbf' kernel the hyper-parameter sigma is very
    # important. It should be chosen according to the data set, specifically
    # the distance between data points, so that each point has roughly 10
    # neighbours with a large enough weight. It also influences the speed of
    # convergence, so the 'knn' kernel may be the better choice.
    # An 'rbf' run would pass kernel_type='rbf', rbf_sigma=0.2 instead.
    unlabel_data_labels = labelPropagation(
        Mat_Label,
        Mat_Unlabel,
        labels,
        kernel_type='knn',
        knn_num_neighbors=10,
        max_iter=400
    )
    show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels)
那为啥不切大矩阵PUU,而是切小点的矩阵FU呢?因为大矩阵PUU没法独立分块,而并行的一个原则是各部分的处理必须是独立的。矩阵FU依赖的是所有的U,把PUU切开分发到其他节点的时候,每次FU的更新都需要和其他的节点通信,这个通信的代价是很大的(实际上,很多并行系统没法达到线性加速比的瓶颈就是通信!线性加速比是指,我增加了n台机器,速度就提升了n倍)。但是对类别C也就是矩阵FU按列切分,就不会有这个问题,因为各个类别的计算是独立的。只是决定样本的最终类别的时候,将所有的FU收集回来求max就可以了。
- import os, sys, time
- import numpy as np
- from scipy.sparse import csr_matrix, lil_matrix, eye
- import operator
- import cPickle as pickle
- import mpi4py.MPI as MPI
#
# Global variables for MPI
#

# instance for invoking MPI related functions
comm = MPI.COMM_WORLD
# the node rank in the whole community
comm_rank = comm.Get_rank()
# the size of the whole community, i.e., the total number of working nodes in the MPI cluster
comm_size = comm.Get_size()
- # load mnist dataset
def load_MNIST():
    """Load ``mnist.pkl.gz`` from the current working directory.

    The pickle is expected to hold a ``(train, val, test)`` triple in which
    each element is an ``(images, labels)`` pair. The training split is used
    as the labeled set and the test split as the unlabeled set; the validation
    split is ignored.

    Returns:
        ``(Mat_Label, labels, labels_id, Mat_Unlabel, groundtruth)`` where
        ``labels_id`` is the list of the ten digit classes ``0..9``.
    """
    import gzip

    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # a data file obtained from a trusted source.
    # The with-block closes the file even when unpickling raises.
    with gzip.open("mnist.pkl.gz", "rb") as f:
        train, val, test = pickle.load(f)

    Mat_Label = train[0]
    labels = train[1]
    Mat_Unlabel = test[0]
    groundtruth = test[1]
    labels_id = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    return Mat_Label, labels, labels_id, Mat_Unlabel, groundtruth
- # return k neighbors index
def navie_knn(dataSet, query, k):
    """Find the ``k`` nearest rows of ``dataSet`` to ``query``.

    Uses squared Euclidean distance and returns the row indices sorted from
    nearest to farthest. ``k`` is capped at the number of rows.
    """
    # step 1: squared Euclidean distance between the query and every row
    tiled_query = np.tile(query, (dataSet.shape[0], 1))
    squared_dist = np.sum(np.square(tiled_query - dataSet), axis=1)

    # step 2: sort by distance and truncate to k entries
    ranking = np.argsort(squared_dist)
    limit = len(ranking) if k > len(ranking) else k
    return ranking[:limit]
- # build a big graph (normalized weight matrix)
- # sparse U x (U + L) matrix
def buildSubGraph(Mat_Label, Mat_Unlabel, knn_num_neighbors):
    """Build the sparse k-NN transition matrix of the unlabeled samples.

    Row ``i`` describes unlabeled sample ``i``. The columns index the rows of
    ``vstack((Mat_Label, Mat_Unlabel))``, labeled samples first. Each row
    puts equal weight on the nearest samples returned by ``navie_knn``.

    Args:
        Mat_Label: 2-D array of labeled samples.
        Mat_Unlabel: 2-D array of unlabeled samples.
        knn_num_neighbors: number of neighbours requested per sample.

    Returns:
        ``scipy.sparse.csr_matrix`` of dtype float32 with shape
        ``(num_unlabeled, num_labeled + num_unlabeled)``.
    """
    num_unlabel_samples = Mat_Unlabel.shape[0]
    Mat_all = np.vstack((Mat_Label, Mat_Unlabel))
    num_all_samples = Mat_all.shape[0]

    data = []; indices = []; indptr = [0]
    for i in range(num_unlabel_samples):
        k_neighbors = navie_knn(Mat_all, Mat_Unlabel[i, :], knn_num_neighbors)
        # navie_knn may return fewer than k indices, so use the actual count
        # for both the row pointer and the weight.
        num_found = len(k_neighbors)
        indptr.append(indptr[-1] + num_found)
        indices.extend(k_neighbors)
        if num_found:
            data.append(np.full(num_found, 1.0 / num_found, dtype=np.float32))

    values = np.hstack(data) if data else np.zeros(0, np.float32)
    # The explicit shape keeps trailing columns that no sample selected as a
    # neighbour; without it the column count is inferred from the largest
    # index present.
    return csr_matrix(
        (values, np.asarray(indices, dtype=np.int32), np.asarray(indptr, dtype=np.int32)),
        shape=(num_unlabel_samples, num_all_samples))
- # build a big graph (normalized weight matrix)
- # sparse U x (U + L) matrix
def buildSubGraph_MPI(Mat_Label, Mat_Unlabel, knn_num_neighbors):
    """Build the sparse k-NN transition matrix with the rows split across MPI ranks.

    Every rank computes the neighbours of a contiguous slice of the unlabeled
    samples. The per-rank pieces are then exchanged with ``comm.allgather``,
    so each rank returns the complete matrix. This is a collective operation:
    all ranks of ``comm`` must call it.

    Args:
        Mat_Label: 2-D array of labeled samples.
        Mat_Unlabel: 2-D array of unlabeled samples.
        knn_num_neighbors: number of neighbours requested per sample.

    Returns:
        ``scipy.sparse.csr_matrix`` of dtype float32 with shape
        ``(num_unlabeled, num_labeled + num_unlabeled)``. Columns follow the
        row order of ``vstack((Mat_Label, Mat_Unlabel))``.
    """
    num_unlabel_samples = Mat_Unlabel.shape[0]
    Mat_all = np.vstack((Mat_Label, Mat_Unlabel))
    num_all_samples = Mat_all.shape[0]

    # Contiguous row range handled by this rank.
    sample_offset = np.linspace(0, num_unlabel_samples, comm_size + 1).astype('int')
    local_indices = []
    local_counts = []
    for i in range(sample_offset[comm_rank], sample_offset[comm_rank+1]):
        k_neighbors = navie_knn(Mat_all, Mat_Unlabel[i, :], knn_num_neighbors)
        local_indices.extend(k_neighbors)
        local_counts.append(len(k_neighbors))

    # Flat 1-D arrays per rank, so the pieces can be concatenated even when
    # ranks own different numbers of rows.
    gathered_indices = comm.allgather(np.asarray(local_indices, dtype=np.int32))
    gathered_counts = comm.allgather(np.asarray(local_counts, dtype=np.int32))
    indices = np.concatenate(gathered_indices)
    counts = np.concatenate(gathered_counts)

    # Row pointers are the running total of neighbours per row; each stored
    # entry of a row weighs 1 / (number of neighbours found for that row).
    indptr = np.concatenate(([0], np.cumsum(counts))).astype(np.int32)
    row_weight = (1.0 / np.maximum(counts, 1)).astype(np.float32)
    data = np.repeat(row_weight, counts)

    return csr_matrix((data, indices, indptr),
                      shape=(num_unlabel_samples, num_all_samples), dtype = np.float32)
- # label propagation
def run_label_propagation_sparse(knn_num_neighbors = 20, max_iter = 100, tol = 1e-4, test_per_iter = 1):
    """Run class-parallel sparse label propagation on MNIST under MPI.

    The k-NN graph is built once with ``buildSubGraph_MPI``. The classes are
    then split into contiguous ranges, one per rank, and every rank propagates
    only its own columns of the label matrix. Results are combined and scored
    by ``evaluation``. All ranks of ``comm`` must call this function.

    Args:
        knn_num_neighbors: neighbour count for the k-NN graph.
        max_iter: maximum number of propagation steps.
        tol: stop when the summed absolute change over all ranks drops below
            this value.
        test_per_iter: evaluate the accuracy every this many iterations.

    Raises:
        ValueError: more processes than classes were started.
    """
    # load data and graph
    print("Processor %d/%d loading graph file..." % (comm_rank, comm_size))
    # The fifth value returned by load_MNIST is the ground truth of the
    # unlabeled (test) samples; it is only used for scoring.
    Mat_Label, labels, labels_id, Mat_Unlabel, unlabel_data_id = load_MNIST()
    if comm_size > len(labels_id):
        raise ValueError("Sorry, the processors must be less than the number of classes")
    affinity_matrix = buildSubGraph_MPI(Mat_Label, Mat_Unlabel, knn_num_neighbors)

    # get some parameters
    num_classes = len(labels_id)
    num_label_samples = len(labels)
    num_unlabel_samples = Mat_Unlabel.shape[0]

    # Columns are ordered labeled-first, so the two slices are P_UL and P_UU.
    affinity_matrix_UL = affinity_matrix[:, 0:num_label_samples]
    affinity_matrix_UU = affinity_matrix[:, num_label_samples:num_label_samples+num_unlabel_samples]

    if comm_rank == 0:
        print("Have %d labeled images, %d unlabeled images and %d classes" % (num_label_samples, num_unlabel_samples, num_classes))

    # divide label_function_U and label_function_L to all processors
    class_offset = np.linspace(0, num_classes, comm_size + 1).astype('int')

    # initialize local label_function_U
    local_start_class = class_offset[comm_rank]
    local_num_classes = class_offset[comm_rank+1] - local_start_class
    local_label_function_U = eye(num_unlabel_samples, local_num_classes, 0, np.float32, format='csr')

    # initialize local label_function_L: one-hot rows restricted to the
    # classes owned by this rank
    local_label_function_L = lil_matrix((num_label_samples, local_num_classes), dtype = np.float32)
    for i in range(num_label_samples):
        class_off = int(labels[i]) - local_start_class
        if class_off >= 0 and class_off < local_num_classes:
            local_label_function_L[i, class_off] = 1.0
    local_label_function_L = local_label_function_L.tocsr()
    # P_UL * Y_L never changes, so it is computed once.
    local_label_info = affinity_matrix_UL.dot(local_label_function_L)
    print("Processor %d/%d has to process %d classes..." % (comm_rank, comm_size, local_label_function_L.shape[1]))

    # start to propagation
    iteration = 1
    evaluation(num_unlabel_samples, local_start_class, local_label_function_U, unlabel_data_id, labels_id)
    while True:
        pre_label_function = local_label_function_U.copy()

        # propagation
        local_label_function_U = affinity_matrix_UU.dot(local_label_function_U) + local_label_info

        # check converge; the reduced total is only meaningful on rank 0
        local_changed = abs(pre_label_function - local_label_function_U).sum()
        changed = comm.reduce(local_changed, root = 0, op = MPI.SUM)

        status = 'RUN'
        test = False
        if comm_rank == 0:
            norm_changed = changed / (num_unlabel_samples * num_classes)
            print("---> Iteration %d/%d, changed: %f" % (iteration, max_iter, norm_changed))
            # NOTE(review): the printed value is normalised, but the stop test
            # compares the raw total with tol - confirm this is intended.
            if iteration >= max_iter or changed < tol:
                status = 'STOP'
                print("************** Iteration over! ****************")
            if iteration % test_per_iter == 0:
                test = True
        iteration += 1

        # Rank 0 decides; every rank receives the same decision.
        test = comm.bcast(test if comm_rank == 0 else None, root = 0)
        status = comm.bcast(status if comm_rank == 0 else None, root = 0)
        if status == 'STOP':
            break
        if test:
            evaluation(num_unlabel_samples, local_start_class, local_label_function_U, unlabel_data_id, labels_id)
    evaluation(num_unlabel_samples, local_start_class, local_label_function_U, unlabel_data_id, labels_id)
def evaluation(num_unlabel_samples, local_start_class, local_label_function_U, unlabel_data_id, labels_id):
    """Combine the per-rank scores and print the accuracy on rank 0.

    Every rank finds, for each unlabeled sample, its best local class and the
    score of that class. The per-rank winners are exchanged with
    ``comm.allgather``; rank 0 then takes the best rank for each sample and
    compares the resulting class with the ground truth. All ranks of ``comm``
    must call this function.

    Args:
        num_unlabel_samples: number of unlabeled samples (rows to score).
        local_start_class: global index of this rank's first class column.
        local_label_function_U: sparse score matrix of this rank, one row per
            unlabeled sample and one column per locally owned class.
        unlabel_data_id: ground-truth class of every unlabeled sample.
        labels_id: class id for every global class index.
    """
    # get local label with max score
    if comm_rank == 0:
        print("Start to combine local result...")
    # Densify once instead of converting row by row.
    local_scores = local_label_function_U.toarray()[:num_unlabel_samples]
    rows = np.arange(num_unlabel_samples)
    best_local = np.argmax(local_scores, axis=1)
    local_max_score = local_scores[rows, best_local].astype(np.float32).reshape(-1, 1)
    # Shift the local column index to the global class index.
    local_max_label = (best_local + local_start_class).astype(np.int32).reshape(-1, 1)

    # gather the results from all the processors: one column per rank
    if comm_rank == 0:
        print("Start to gather results from all processors")
    all_max_label = np.hstack(comm.allgather(local_max_label))
    all_max_score = np.hstack(comm.allgather(local_max_score))

    # get terminate label of unlabeled data
    if comm_rank == 0:
        print("Start to analysis the results...")
        for i in range(0, num_unlabel_samples, 1000):
            print("*** %s" % (all_max_score[i],))
        best_rank = np.argmax(all_max_score, axis=1)
        max_label = all_max_label[rows, best_rank]
        predicted = np.asarray(labels_id)[max_label].astype(np.int64)
        truth = np.asarray(unlabel_data_id)[:num_unlabel_samples].astype(np.int64)
        right_predict_count = int(np.count_nonzero(predicted == truth))
        # Guard against an empty test set.
        if num_unlabel_samples:
            accuracy = float(right_predict_count) * 100.0 / num_unlabel_samples
        else:
            accuracy = 0.0
        print("Have %d samples, accuracy: %.3f%%!" % (num_unlabel_samples, accuracy))
if __name__ == '__main__':
    # Entry point when started as a script (one call per MPI process).
    run_label_propagation_sparse(
        knn_num_neighbors=20,
        max_iter=30
    )
