
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip


In [3]: user_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.user")

In [4]: user_data.first()

Out[4]: u'1|24|M|technician|85711'

In [5]: user_fields=user_data.map(lambda line: line.split("|"))

In [8]: num_users = user_fields.map(lambda fields: fields[0]).count()

In [10]: num_genders=user_fields.map(lambda fields: fields[2]).distinct().count()

In [11]: num_occupations=user_fields.map(lambda fields: fields[3]).distinct().count()

In [12]: num_zIpcodes=user_fields.map(lambda fields: fields[4]).distinct().count()

In [16]: print "Users: %d, genders: %d, occupations: %d, zip codes: %d" %(num_users, num_genders, num_occupations, num_zipcodes)
Users: 943, genders: 2, occupations: 21, zip codes: 795

In [17]: ages = user_fields.map(lambda x: int(x[1])).collect()

In [18]: hist(ages, bins=20, color='lightblue', normed=True)
n [19]: fig = matplotlib.pyplot.gcf()

In [20]: fig.set_size_inches(16, 10)

In [23]: count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()

In [24]: import numpy as np

In [25]: x_axis1 = np.array([c[0] for c in count_by_occupation])

In [26]: y_axis1 = np.array([c[1] for c in count_by_occupation])

In [27]: x_axis = x_axis1[np.argsort(x_axis1)]

In [28]: y_axis = y_axis1[np.argsort(y_axis1)]

In [29]: pos = np.arange(len(x_axis))

In [30]: width = 1.0

In [31]: ax = plt.axes()

In [32]: ax.set_xticks(pos + (width / 2))
In [34]: ax.set_xticklabels(x_axis)
In [35]: plt.bar(pos, y_axis, width, color='lightblue')
Out[35]: <Container object of 21 artists>

In [36]: plt.xticks(rotation=30)
In [37]: fig = matplotlib.pyplot.gcf()

In [38]: fig.set_size_inches(16, 10)

In [39]: count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()

In [46]: print "Map-reduce approach: "
Map-reduce approach:

In [47]: print dict(count_by_occupation)
{u'administrator': 79, u'writer': 45, u'retired': 14, u'lawyer': 12, u'doctor': 7, u'marketing': 26, u'executive': 32, u'none': 9, u'entertainment': 18, u'healthcare': 16, u'scientist': 31, u'student': 196, u'educator': 95, u'technician': 27, u'librarian': 51, u'programmer': 66, u'artist': 28, u'salesman': 12, u'other': 105, u'homemaker': 7, u'engineer': 67}

In [48]: print ""

In [49]: print "countByValue approach:"
countByValue approach:

In [50]: print dict(count_by_occupation2)
{u'administrator': 79, u'retired': 14, u'lawyer': 12, u'healthcare': 16, u'marketing': 26, u'executive': 32, u'scientist': 31, u'student': 196, u'technician': 27, u'librarian': 51, u'programmer': 66, u'salesman': 12, u'homemaker': 7, u'engineer': 67, u'none': 9, u'doctor': 7, u'writer': 45, u'entertainment': 18, u'other': 105, u'educator': 95, u'artist': 28}

In [55]: def convert_year(x):
....: try:
....: return int(x[-4:])
....: except:
....: return 1990

In [56]: movie_fields = movie_data.map(lambda lines: lines.split("|"))

In [57]: years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))

In [58]: years_filtered = years.filter(lambda x: x != 1900)

In [59]: movie_ages = years_filtered.map(lambda yr: 1998 - yr).countByValue()

In [60]: values = movie_ages.values()

In [61]: bins = movie_ages.keys()

In [63]: fig = matplotlib.pyplot.gcf()

In [64]: fig.set_size_inches(16, 10)

In [65]: rating_data = sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.data")

In [66]: print rating_data.first()
196 242 3 881250949

In [67]: num_ratings = rating_data.count()

In [68]: print "Ratings: %d " % num_ratings
Ratings: 100000

In [83]: ratings_per_movie = num_ratings / num_movies

In [84]: print "Min ratings: %d" % min_rating
Min ratings: 1

In [85]: print "Max ratings: %d" % max_rating
Max ratings: 5

In [86]: print "Average rating: %2.2f" % mean_rating
Average rating: 3.00

In [87]: print "Median rating: %d" % mean_rating
Median rating: 3

In [88]: print "Average # of ratings per user: %2.2f" % ratings_per_uer
Average # of ratings per user: 106.00

In [89]: print "Average # of ratings per movie: %2.2f" % ratings_per_movie
Average # of ratings per movie: 59.00

In [90]: ratings.stats()
Out[90]: (count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5.0, min: 1.0)

In [91]: count_by_rating = ratings.countByValue()

In [92]: x_axis = np.array(count_by_rating.keys())

In [93]: y_axis = np.array([float(c) for c in count_by_rating.values()])

In [94]: y_axis_normed = y_axis / y_axis.sum()

In [95]: pos = np.arange(len(x_axis))

In [96]: width = 1.0

In [97]: ax = plt.axes()

In [98]: ax.set_xticks(pos + (width / 2))
In [99]: ax.set_xticklabels(x_axis)
In [100]:

In [100]: plt.bar(pos, y_axis_normed, width, color='lightblue')
Out[100]: <Container object of 5 artists>

In [101]: plt.xticks(rotation=30)
Out[101]: (array([ 0.5, 1.5, 2.5, 3.5, 4.5]), <a list of 5 Text xticklabel objects>)

In [102]: fig = matplotlib.pyplot.gcf()

In [103]: fig.set_size_inches(16, 10)

In [104]: user_ratings_grouped = rating_data.map(lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()

In [105]: user_ratings_by_user = user_ratings_grouped.map(lambda (k, v): (k, len(v)))

In [106]: user_ratings_by_user.take(5)
Out[106]: [(2, 62), (4, 24), (6, 211), (8, 59), (10, 184)]

In [107]: user_ratings_by_user_local = user_ratings_by_user.map(lambda (k, v): v).collect()

In [108]: hist(user_ratings_by_user_local, bins=200, color='lightblue', normed=True)
<a list of 200 Patch objects>)

In [109]: fig = matplotlib.pyplot.gcf()

In [110]: fig.set_size_inches(16, 10)

3.3. 处理与转换数据;

In [112]: years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()

In [113]: years_pre_processed_array = np.array(years_pre_processed)

In [114]: mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])

In [115]: median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])

In [122]: index_bad_data = np.where(years_pre_processed_array == 1900)[0]

In [123]: index_bad_data
Out[123]: array([], dtype=int64)

In [124]: years_pre_processed_array[index_bad_data] = median_year

In [125]: print "Mean year of release: %d" % mean_year
Mean year of release: 1989

In [126]: print "Median year of release: %d" % median_year
Median year of release: 1995

In [130]: print "Index of '1900' after assigning median: %s" % np.where(years_pre_processed_array == 1900)[0]
Index of '1900' after assigning median: []


In [131]: all_occupations = user_fields.map(lambda fields: fields[3]).distinct().collect()

In [132]: all_occupations.sort()

In [133]:

In [133]: idx = 0

In [134]: all_occupations_dict = {}

In [135]: for o in all_occupations:
.....: all_occupations_dict[o] = idx
.....: idx += 1

In [136]: print "Encoding of 'doctor': %d" %all_occupations_dict['doctor']
Encoding of 'doctor': 2

In [137]: print "Encoding of 'programmer': %d" %all_occupations_dict['programmer']
Encoding of 'programmer': 14

In [139]: k = len(all_occupations_dict)

In [140]: binary_x = np.zeros(k)

In [141]: k_programmer = all_occupations_dict['programmer']

In [142]: binary_x[k_programmer] = 1

In [143]: print "Binary feature vector: %s" %binary_x
Binary feature vector: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0.]

In [144]: print "Length of binary vector: %d" %k
Length of binary vector: 21

In [145]: def extract_datetime(ts):
.....: import datetime
.....: return datetime.datetime.fromtimestamp(ts)

In [149]: timestamps = rating_data.map(lambda fields: int(fields[3]))

In [150]: hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour)

In [151]: hour_of_day.take(5)
Out[151]: [23, 3, 15, 13, 13]

In [154]: def assign_tod(hr):
.....: times_of_day = {
.....: 'morning' : range(7, 12),
.....: 'lunch' : range(12, 14),
.....: 'afternoon' : range(14, 18),
.....: 'evening' : range(18, 23),
.....: 'night' : range(23, 7)
.....: }
.....: for k, v in times_of_day.iteritems():
.....: if hr in v:
.....: return k

In [166]: def assign_tod(hr):
.....: times_of_day = {
.....: 'morning' : range(7, 12),
.....: 'lunch' : range(12, 14),
.....: 'afternoon' : range(14, 18),
.....: 'evening' : range(18, 23),
.....: 'night' : range(23, 24) + range(0, 7)
.....: }
.....: for k, v in times_of_day.iteritems():
.....: if hr in v:
.....: return k

In [167]:

In [167]: time_of_day = hour_of_day.map(lambda hr: assign_tod(hr))

In [168]: time_of_day.take(5)
Out[168]: ['night', 'night', 'afternoon', 'lunch', 'lunch']

In [170]: def extract_titile(raw):
.....: import re
.....: grps = re.search("\((\w+)\)", raw)
.....: if grps:
.....: return raw[:grps.start()].strip()
.....: else:
.....: return raw

In [171]: raw_titles = movie_fields.map(lambda fields: fields[1])

In [172]: for raw_title in raw_titles.take(5):
.....: print extract_titile(raw_title)
Toy Story
Four Rooms
Get Shorty

In [173]: movie_titles = raw_titles.map(lambda m: extract_titile(m))

In [174]: title_terms = movie_titles.map(lambda t: t.split(" "))

In [175]: print title_terms.take(5)
[[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]

In [176]: all_terms = title_terms.flatMap(lambda x: x).distinct().collect()

In [177]: idx = 0

In [178]: all_terms_dict = {}

In [179]: for term in all_terms:
.....: all_occupations_dict[term] = idx
.....: idx += 1

In [180]: print "Total number of terms: %d" % len(all_terms_dict)
Total number of terms: 0

In [181]: print "Index of term 'Dead': %d" % all_occupations_dict['Dead']
Index of term 'Dead': 147

In [182]: print "Index of term 'Rooms': %d" % all_occupations_dict['Rooms']
Index of term 'Rooms': 1963

In [184]: %paste

def create_vector(terms, term_dict):
from scipy import sparse as sp
num_terms = len(term_dict)
x = sp.csc_matrix((1, num_terms))
for t in terms:
if t in term_dict:
idx = term_dict[t]
x[0, idx] = 1
return x
## -- End pasted text --

In [185]:

In [185]: all_terms_bcast = sc.broadcast(all_terms_dict)

In [186]: term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))

In [187]: term_vectors.take(5)
[<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>]

In [188]: np.random.seed(42)

In [189]: x = np.random.randn(10)

In [190]: norm_x_2 = np.linalg.norm(x)

In [191]: normalized_x = x /norm_x_2

In [192]: print "x: \n%s" % x
[ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
1.57921282 0.76743473 -0.46947439 0.54256004]

In [193]: print "Normalized x: \n%s" % normalized_x
Normalized x: 
[ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
0.60954584 0.29621508 -0.1812081 0.20941776]

In [194]: print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)
2-Norm of normalized_x: 1.0000

In [199]: vector = sc.parallelize([x])

In [200]: from pyspark.mllib.feature import Normalizer

In [201]: normalizer = Normalizer()

In [202]: vector = sc.parallelize([x])

In [203]: normalized_x_mllib = normalizer.transform(vector).first().toArray()

In [204]: print "x: \n%s" % x
[ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
1.57921282 0.76743473 -0.46947439 0.54256004]

In [205]: print "2-Norm of x: %2.4f" % norm_x_2
2-Norm of x: 2.5908

In [206]: print "Normalized x MLlib: \n%s" % normalized_x_mllib
Normalized x MLlib:
[ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
0.60954584 0.29621508 -0.1812081 0.20941776]

In [207]: print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
2-Norm of normalized_x_mllib: 1.0000


