import os
import sys

spark_name = os.environ.get('SPARK_HOME', None)
if not spark_name:
    raise ValueError('the SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_name, 'python'))
sys.path.insert(0, os.path.join(spark_name, 'python/lib/py4j-0.10.4-src.zip'))

import pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

sc = SparkContext(appName="MyProject")

In yarn mode this still fails with "No module named pyspark".
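A hedged workaround sketch: run the same sys.path patching before importing pyspark, and make sure the executors can also see the pyspark/py4j sources. PYSPARK_PYTHON and spark.executorEnv.PYTHONPATH are standard Spark settings, but the values below assume Spark is installed at the same SPARK_HOME on every node, so adjust them to the cluster layout.

import os
from pyspark import SparkConf, SparkContext

os.environ.setdefault('PYSPARK_PYTHON', 'python3')  # interpreter used by the executors

conf = (SparkConf()
        .setMaster('yarn')
        .setAppName('MyProject')
        # make pyspark/py4j importable inside the executors as well
        .set('spark.executorEnv.PYTHONPATH',
             os.path.join(os.environ['SPARK_HOME'], 'python')))
sc = SparkContext(conf=conf)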

## Initialize a SparkContext
from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setMaster('local[20]')
        .setAppName('my spark app')
        .set("spark.executor.memory", '10g'))
sc = SparkContext(conf=conf)

# or
sc = SparkContext(appName="my spark app", master='local')

## Print SparkContext information
print(sc.version)
print(sc.pythonVer)
print(sc.master)
print(sc.sparkHome)
print(sc.sparkUser())
print(sc.appName)
print(sc.applicationId)
print(sc.defaultParallelism)
print(sc.defaultMinPartitions)

RDD operations

Viewing basic statistics

rdd = sc.parallelize([('a', 1), ('b', 4), ('a', 5)])
print(rdd.getNumPartitions())
print(rdd.count())
print(rdd.countByKey())
print(rdd.countByValue())
print(rdd.collectAsMap())

Output:
1
3
defaultdict(<class 'int'>, {'a': 2, 'b': 1})
defaultdict(<class 'int'>, {('a', 1): 1, ('b', 4): 1, ('a', 5): 1})
{'a': 5, 'b': 4}
## For a one-dimensional RDD
import random

rdd = sc.parallelize([random.randint(0, 20) for i in range(10)])
print(rdd.collect())
print(sorted(rdd.collect()))
print(rdd.max())
print(rdd.min())
print(rdd.mean())
print(rdd.stdev())
print(rdd.stats())
print(rdd.histogram(3))

Output:
[20, 3, 7, 19, 7, 20, 20, 5, 16, 2]
[2, 3, 5, 7, 7, 16, 19, 20, 20, 20]
20
2
11.9
7.327346040688948
(count: 10, mean: 11.9, stdev: 7.327346040688948, max: 20.0, min: 2.0)
([2, 8, 14, 20], [5, 0, 5])
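A few more numeric helpers exist on the same kind of RDD; a quick sketch on the rdd above (outputs omitted):

print(rdd.sum())
print(rdd.variance())     # population variance, consistent with stdev() above
print(rdd.sampleStdev())  # sample standard deviation (n - 1 in the denominator)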

Applying functions

import random

data_random = [[random.randint(0, 20) for j in range(5)] for i in range(3)]
rdd = sc.parallelize(data_random)
rdd1 = rdd.map(lambda x: sorted(x))
print(rdd1.collect())
rdd2 = rdd.flatMap(lambda x: sorted(x))
print(rdd2.collect())

Output:
[[0, 5, 8, 13, 20], [0, 3, 3, 9, 11], [3, 4, 7, 11, 17]]
[0, 5, 8, 13, 20, 0, 3, 3, 9, 11, 3, 4, 7, 11, 17]
rdd = sc.parallelize([('a', [3,4,5,6]), ('b', [11,22,33,44]), ('a', [1111,2222])])  ## note: the elements are tuples
rdd1 = rdd.mapValues(lambda x: len(x))
print(rdd1.collect())
print(rdd.reduceByKey(lambda x, y: x + y).collect())
# groupByKey(): applied to a dataset of (K, V) pairs, it returns a new dataset of
# (K, Iterable) pairs -- see the short sketch after the output below
print(rdd.reduce(lambda x, y: x + y))  ## tuple concatenation

Output:
[('a', 4), ('b', 4), ('a', 2)]
[('a', [3, 4, 5, 6, 1111, 2222]), ('b', [11, 22, 33, 44])]
('a', [3, 4, 5, 6], 'b', [11, 22, 33, 44], 'a', [1111, 2222])
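A short sketch of the groupByKey() mentioned in the comment above: values sharing a key are collected into an iterable, materialized here with list (output order may differ):

rdd = sc.parallelize([('a', 1), ('b', 4), ('a', 5)])
print(rdd.groupByKey().mapValues(list).collect())  # e.g. [('a', [1, 5]), ('b', [4])]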
rdd = sc.parallelize([i for i in range(10)])
print(rdd.collect())
print(rdd.reduce(lambda x, y: x + y))

Output:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
45

rdd = sc.parallelize([i for i in range(5)])
print(rdd.collect())
print(rdd.keyBy(lambda x: chr(ord('a') + x)).collect())  ## add a key to each element

Output:
[0, 1, 2, 3, 4]
[('a', 0), ('b', 1), ('c', 2), ('d', 3), ('e', 4)]

Selecting data

data_random = [[random.randint(0, 20) for j in range(5)] for i in range(3)]
rdd = sc.parallelize(data_random)
print(rdd.collect())
print(rdd.first())  ## return the first element
print(rdd.take(2))  ## return the first two elements as a list
print(rdd.top(2))   ## return the two largest elements

Output:
[[2, 10, 8, 15, 8], [7, 5, 6, 17, 0], [18, 17, 7, 9, 13]]
[2, 10, 8, 15, 8]
[[2, 10, 8, 15, 8], [7, 5, 6, 17, 0]]
[[18, 17, 7, 9, 13], [7, 5, 6, 17, 0]]
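Two related actions, as a sketch on a fresh one-dimensional RDD (output depends on the random data): takeOrdered() returns the smallest elements, takeSample() draws a random sample as a list.

rdd_flat = sc.parallelize([random.randint(0, 20) for i in range(10)])
print(rdd_flat.takeOrdered(3))                  # the three smallest elements
print(rdd_flat.takeSample(False, 3, seed=123))  # sample of 3 without replacement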
## Sampling
data_random = [[random.randint(0, 20) for j in range(5)] for i in range(100)]
rdd = sc.parallelize(data_random)
print(rdd.count())
rdd_sample = rdd.sample(withReplacement=False, fraction=0.3, seed=123)
print(rdd_sample.count())

Output:
100
31

## Filtering (return the elements that satisfy a condition)
rdd = sc.parallelize([random.randint(0, 10) for i in range(20)])
print(rdd.collect())
rdd_filtered = rdd.filter(lambda x: x > 5)
print(rdd_filtered.collect())

Output:
[7, 10, 0, 4, 1, 5, 7, 2, 4, 2, 10, 2, 9, 10, 6, 5, 8, 0, 5, 2]
[7, 10, 7, 10, 9, 10, 6, 8]

## Distinct elements
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5])
print(rdd.collect())
print(rdd.distinct().collect())

Output:
[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5]
[1, 2, 3, 4, 5, 6]

rdd = sc.parallelize([('a', [3,4,5,6]), ('b', [11,22,33,44]), ('a', [1111,2222])])
print(rdd.keys().collect())
print(rdd.values().collect())

Output:
['a', 'b', 'a']
[[3, 4, 5, 6], [11, 22, 33, 44], [1111, 2222]]

Aggregation

rdd = sc.parallelize([i for i in range(50)])
rdd_groupby = rdd.groupBy(lambda x: x % 3)
print(rdd_groupby.collect())
print(rdd_groupby.mapValues(list).collect())
print(rdd_groupby.map(lambda x: (x[0], list(x[1]))).collect())

Output:
[(0, <pyspark.resultiterable.ResultIterable object at 0x7f2359670be0>), (1, <pyspark.resultiterable.ResultIterable object at 0x7f2359670a58>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f2359717be0>)]
[(0, [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]), (1, [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]), (2, [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47])]
[(0, [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]), (1, [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]), (2, [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47])]

from operator import add
sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)

Output:
15

## Process each partition separately, then combine the per-partition results
sc.parallelize([1, 2, 3, 4]).aggregate(
    (0, 0),
    (lambda acc, value: (acc[0] + value, acc[1] + 1)),
    (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])))

Output:
(10, 4)
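A related sketch: aggregateByKey() applies the same (seqFunc, combFunc) pattern per key, for example building (sum, count) pairs to get a per-key mean (output order may vary):

rdd = sc.parallelize([('a', 1), ('b', 4), ('a', 5)])
sum_count = rdd.aggregateByKey((0, 0),
                               lambda acc, v: (acc[0] + v, acc[1] + 1),
                               lambda a, b: (a[0] + b[0], a[1] + b[1]))
print(sum_count.mapValues(lambda t: t[0] / t[1]).collect())  # e.g. [('a', 3.0), ('b', 4.0)]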
list(range(10))

Output:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## Set operations
rdd1 = sc.parallelize(list(range(10)))
rdd2 = sc.parallelize(list(range(5, 15)))
rdd3 = rdd2.subtract(rdd1)  ## elements of rdd2 that are not in rdd1
print(rdd3.collect())

Output:
[10, 12, 14, 11, 13]

rdd1 = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
rdd2 = sc.parallelize([('a', 1), ('b', 2), ('d', 4)])
rdd3 = rdd2.subtractByKey(rdd1)  ## difference computed on keys only
print(rdd3.collect())

Output:
[('d', 4)]

rdd1 = sc.parallelize(list(range(3)))
rdd2 = sc.parallelize(list(range(5, 8)))
rdd3 = rdd1.cartesian(rdd2)  ## Cartesian product
print(rdd3.collect())

Output:
[(0, 5), (0, 6), (0, 7), (1, 5), (1, 6), (1, 7), (2, 5), (2, 6), (2, 7)]
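A few other common combinations, as a sketch: union() keeps duplicates, intersection() keeps only shared elements, and join() matches (key, value) pair RDDs; collect() order is not guaranteed.

a = sc.parallelize([1, 2, 3, 4])
b = sc.parallelize([3, 4, 5, 6])
print(a.union(b).collect())                  # [1, 2, 3, 4, 3, 4, 5, 6]
print(sorted(a.intersection(b).collect()))   # [3, 4]
p1 = sc.parallelize([('a', 1), ('b', 2)])
p2 = sc.parallelize([('a', 3)])
print(p1.join(p2).collect())                 # [('a', (1, 3))]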
## Repartitioning an RDD
rdd = rdd.repartition(4)  # returns a new RDD, so reassign
rdd = rdd.coalesce(2)     # returns a new RDD, so reassign
print(rdd.getNumPartitions())

Output:
2

## Saving
rdd.saveAsTextFile(path)
rdd.saveAsHadoopFile(path)
...
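A hedged round-trip sketch (the output directory is a placeholder): saveAsTextFile() writes one part-file per partition as plain strings, and sc.textFile() reads them back, so values have to be parsed again.

rdd = sc.parallelize(range(10), 2)
rdd.saveAsTextFile('/tmp/rdd_demo')               # hypothetical output directory
restored = sc.textFile('/tmp/rdd_demo').map(int)  # parse the strings back to ints
print(sorted(restored.collect()))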

DataFrame

from pyspark.sql import SparkSession
spark = SparkSession(sc)
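An equivalent and more common way to obtain a session is the builder API, shown here as a sketch; getOrCreate() reuses an existing SparkContext if one is already running:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName('my spark app')
         .getOrCreate())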
## Create a DataFrame from an RDD
from pyspark.sql.types import *
from pyspark.sql import Row

## Infer the schema from the RDD
rdd = sc.parallelize([[i, chr(i + 97)] for i in range(5)])
print(rdd.collect())
rdd = rdd.map(lambda x: Row(No=x[0], char=x[1]))
print(rdd.collect())
df = spark.createDataFrame(rdd)
df.show()

Output:
[[0, 'a'], [1, 'b'], [2, 'c'], [3, 'd'], [4, 'e']]
[Row(No=0, char='a'), Row(No=1, char='b'), Row(No=2, char='c'), Row(No=3, char='d'), Row(No=4, char='e')]
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
## Specify the schema explicitly
rdd = sc.parallelize([Row(No=i, char=chr(i + 97)) for i in range(5)])
print(rdd.collect())
fields = [StructField(name='No', dataType=IntegerType(), nullable=True),
          StructField(name='char', dataType=StringType(), nullable=True)]
schema = StructType(fields)
df = spark.createDataFrame(rdd, schema)
df.show()

Output:
[Row(No=0, char='a'), Row(No=1, char='b'), Row(No=2, char='c'), Row(No=3, char='d'), Row(No=4, char='e')]
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
## Save the DataFrame to files
df.select('*').write.save(path + 'df.parquet')
df.select('*').write.save(path + 'df.json', format='json')

## Read back from files
df = spark.read.load(path + "df.parquet")
print(df.show())
df = spark.read.load(path + "df.json", format='json')
print(df.show())
df = spark.read.json(path + 'df.json')
print(df.show())

Output:
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
None
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
None
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
None
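CSV works the same way through the DataFrameReader/Writer API; a sketch using the same path placeholder (header, inferSchema and mode are standard options):

df.write.csv(path + 'df.csv', header=True, mode='overwrite')
df_csv = spark.read.csv(path + 'df.csv', header=True, inferSchema=True)
df_csv.show()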
## Inspect the DataFrame
print(df.dtypes)
print(df.schema)
print(df.show())
print(df.first())
print(df.take(1))
print(df.describe().show())

Output:
[('No', 'bigint'), ('char', 'string')]
StructType(List(StructField(No,LongType,true),StructField(char,StringType,true)))
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
None
Row(No=0, char='a')
[Row(No=0, char='a')]
+-------+------------------+----+
|summary|                No|char|
+-------+------------------+----+
|  count|                 5|   5|
|   mean|               2.0|null|
| stddev|1.5811388300841898|null|
|    min|                 0|   a|
|    max|                 4|   e|
+-------+------------------+----+
None
print(df.columns)
print(df.count())
print(df.distinct().count())
print(df.printSchema())
print(df.explain())  ## print the physical plan

Output:
['No', 'char']
5
5
root
 |-- No: long (nullable = true)
 |-- char: string (nullable = true)
None
== Physical Plan ==
*FileScan json [No#1051L,char#1052] Batched: false, Format: JSON, Location: InMemoryFileIndex[afs://baihua.afs.baidu.com:9902/user/fpd_dm/tangshengyu/spark/df.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<No:bigint,char:string>
None

DataFrame queries

## select
from pyspark.sql import functions as F

rdd = sc.parallelize([Row(No=1, char='a', values=[1,2]), Row(No=2, char='b', values=[1,4])])
df = spark.createDataFrame(rdd)
print(df.select('No').show())
print(df.select('No', 'char', F.explode('values').alias('value')).show())  ## expand an array column into rows
print(df.select(df['No'] + 1, df['char']).show())  ## operate on a whole column

Output:
+---+
| No|
+---+
|  1|
|  2|
+---+
None
+---+----+-----+
| No|char|value|
+---+----+-----+
|  1|   a|    1|
|  1|   a|    2|
|  2|   b|    1|
|  2|   b|    4|
+---+----+-----+
None
+--------+----+
|(No + 1)|char|
+--------+----+
|       2|   a|
|       3|   b|
+--------+----+
None
print(df.select(df['No'] > 1).show())  ## boolean expression on a column
print(df.select('No', 'char', F.when(df.char == 'a', 1).otherwise(0).alias('is a')).show())  ## derive a new column from an existing one
print(df[df.char.isin('a')].show())  ## filter rows by a condition
print(df.select('No', df.char.like('a')).show())  ## SQL LIKE match on a column

Output:
+--------+
|(No > 1)|
+--------+
|   false|
|    true|
+--------+
None
+---+----+----+
| No|char|is a|
+---+----+----+
|  1|   a|   1|
|  2|   b|   0|
+---+----+----+
None
+---+----+------+
| No|char|values|
+---+----+------+
|  1|   a|[1, 2]|
+---+----+------+
None
+---+-----------+
| No|char LIKE a|
+---+-----------+
|  1|       true|
|  2|      false|
+---+-----------+
None
print(df.select('No', df.char.startswith('a')).show())  ## starts with the given string
print(df.select('No', df.char.endswith('b')).show())    ## ends with the given string

Output:
+---+-------------------+
| No|startswith(char, a)|
+---+-------------------+
|  1|               true|
|  2|              false|
+---+-------------------+
None
+---+-----------------+
| No|endswith(char, b)|
+---+-----------------+
|  1|            false|
|  2|             true|
+---+-----------------+
None
rdd = sc.parallelize([Row(No=1, char='aaaaa', values=[1,2]), Row(No=2, char='bbbbb', values=[1,4])])
df = spark.createDataFrame(rdd)
print(df.select('No', df.char.substr(1, 3)).show())
print(df.select(df.No.between(0, 1)).show())

Output:
+---+---------------------+
| No|substring(char, 1, 3)|
+---+---------------------+
|  1|                  aaa|
|  2|                  bbb|
+---+---------------------+
None
+-------------------------+
|((No >= 0) AND (No <= 1))|
+-------------------------+
|                     true|
|                    false|
+-------------------------+
None
## Add a column
df = df.withColumn('new_col', df.char)
print(df.show())
df = df.withColumnRenamed('new_col', 'renamed_col')
print(df.show())

Output:
+---+-----+------+-------+
| No| char|values|new_col|
+---+-----+------+-------+
|  1|aaaaa|[1, 2]|  aaaaa|
|  2|bbbbb|[1, 4]|  bbbbb|
+---+-----+------+-------+
None
+---+-----+------+-----------+
| No| char|values|renamed_col|
+---+-----+------+-----------+
|  1|aaaaa|[1, 2]|      aaaaa|
|  2|bbbbb|[1, 4]|      bbbbb|
+---+-----+------+-----------+
None
## Drop columns
df = df.drop("renamed_col")
print(df.show())
df = df.drop(df.values)
print(df.show())

Output:
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
+---+-----+------+
None
+---+-----+
| No| char|
+---+-----+
|  1|aaaaa|
|  2|bbbbb|
+---+-----+
None
## groupBy
rdd = sc.parallelize([Row(No=1, char='aaaaa', values=[1,2]),
                      Row(No=2, char='bbbbb', values=[1,4]),
                      Row(No=1, char='ccccc', values=[3,5])])
df = spark.createDataFrame(rdd)
print(df.groupBy('No').count().show())
print(df.groupby('No').count().show())  # groupby is simply an alias of groupBy

Output:
+---+-----+
| No|count|
+---+-----+
|  1|    2|
|  2|    1|
+---+-----+
None
+---+-----+
| No|count|
+---+-----+
|  1|    2|
|  2|    1|
+---+-----+
None
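Grouped data also supports agg() with functions from pyspark.sql.functions (imported as F in the select examples above); a sketch, output omitted:

print(df.groupBy('No')
        .agg(F.count('char').alias('n_rows'),
             F.collect_list('char').alias('chars'))
        .show())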
print(df.filter(df.No > 1).show())

Output:
+---+-----+------+
| No| char|values|
+---+-----+------+
|  2|bbbbb|[1, 4]|
+---+-----+------+
None
## sort
print(df.sort(df.No.desc()).show())  ## descending; use .asc() for ascending
print(df.orderBy(['No', 'char'], ascending=[1, 1]).show())  ## sort by several columns

Output:
+---+-----+------+
| No| char|values|
+---+-----+------+
|  2|bbbbb|[1, 4]|
|  1|aaaaa|[1, 2]|
|  1|ccccc|[3, 5]|
+---+-----+------+
None
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  1|ccccc|[3, 5]|
|  2|bbbbb|[1, 4]|
+---+-----+------+
None
## Replacing values and filling missing values
rdd = sc.parallelize([Row(No=1, char='aaaaa', values=[1,2]),
                      Row(No=2, char='bbbbb', values=[1,4]),
                      Row(No=1, char='ccccc', values=[3,5])])
df = spark.createDataFrame(rdd)
print(df.show())
print(df.na.fill('eeee').show())    ## fill null values (no effect here: there are none)
print(df.na.drop().show())          ## drop rows that contain nulls
print(df.na.replace(1, 10).show())  ## replace a specific value

Output:
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+
None
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+
None
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+
None
+---+-----+------+
| No| char|values|
+---+-----+------+
| 10|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
| 10|ccccc|[3, 5]|
+---+-----+------+
None
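Because the frame above has no nulls, na.fill() and na.drop() change nothing there. A sketch with real nulls so their effect is visible (the data is made up):

df_nulls = spark.createDataFrame([(1, 'a'), (2, None), (None, 'c')], ['No', 'char'])
print(df_nulls.na.fill({'char': 'missing'}).show())  # fill only the string column
print(df_nulls.na.drop().show())                     # keep rows with no nulls at all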
## Partitioning
print(df.repartition(3).rdd.getNumPartitions())
print(df.coalesce(1).rdd.getNumPartitions())

Output:
3
1
## Working with SQL
## Register a view
#df.createGlobalTempView('demo')    # shared across Spark sessions
df.createOrReplaceTempView('demo')  # lives for this Spark session
#df.createTempView('demo')
spark.sql("select * from demo").show()

Output:
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+
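Any SQL that Spark supports can be run against the view; a small sketch, including the global-view variant from the commented-out line above (global temp views live in the fixed global_temp database, and the view name here is hypothetical):

spark.sql("select No, char from demo where No = 1").show()

df.createGlobalTempView('demo_global')
spark.sql("select * from global_temp.demo_global").show()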
## Convert to other representations
df.toJSON()
df.toPandas()
df.rdd.take(1)

# Stop the session
spark.stop()

Pipeline for models
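A minimal pyspark.ml Pipeline sketch, assuming a training DataFrame df_train with a text column 'text' and a double 'label' column (both hypothetical):

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol='text', outputCol='words')
hashing_tf = HashingTF(inputCol='words', outputCol='features')
lr = LogisticRegression(maxIter=10, featuresCol='features', labelCol='label')

pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])
model = pipeline.fit(df_train)           # fits each stage in order
predictions = model.transform(df_train)  # applies the fitted stages
predictions.select('label', 'prediction').show()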

