



1 舆情产品基于 Elasticsearch 存储的大数据,ES 内部使用 Lucene 分词,而 Python 的 jieba 分词结果与 Lucene 分词结果并不一致(要保持一致需要额外的工作)。早期需求只是展示每日热词,分词不一致并不是问题;现在的新需求要求 LDA 的结果与 ES 中的数据无缝结合。如果在 ES 中集成 jieba,再对 ES 内的全量数据重新分词,从工作量和技术难度上看都不现实,因此只好改 LDA 所用的分词算法(实际应用中,不同的分词算法对 LDA 提取主题和热词的结果几乎没有影响)。

2 Python 项目受限于单机计算,不易扩展





ls examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala

root@wx-social-consume2:/usr/spark-2.4.0# ./bin/run-example mllib.LDAExample
Corpus summary:
Training set size: 12 documents
Vocabulary size: 10 terms
Training set size: 62 tokens
Preprocessing time: 5.008004889 sec

Finished training LDA model. Summary:
Training time: 3.565145577 sec
Training data average log likelihood: -20.14862821928427

20 topics:
0 0.2776616245970018
1 0.2467437437143153
2 0.162799944092254
3 0.14357469330901143
4 0.06928101473026803
9 0.04699050355830519
5 0.030203661449368247
6 0.007640498460509008
7 0.007552251606432597
8 0.007552064482534436

0 0.36469051153508564
1 0.18646127739213272
2 0.15522432783704704
3 0.13106181033451642
4 0.06584596207691033
9 0.04395627892970295
5 0.030062711796482375
8 0.007597985215275549
7 0.00759765588135554
6 0.007501479001491501

0 0.31931887563375516
1 0.1962082806692423
2 0.17231677857837657
3 0.14159208272582544
4 0.07269228847838642
9 0.044804492369582984
5 0.030074171215304125
7 0.007707667829625457
8 0.007707603060281631
6 0.007577759439619957

0 0.33172427138380095
1 0.2099908538476618
2 0.1529315801366396
3 0.13055578109706914
4 0.07654434907166915
9 0.045380994421043694
5 0.03023101958436096
8 0.007550809491464853
7 0.007550747971239613
6 0.00753959299505035

0 0.32678177168416583
1 0.2207731082949569
2 0.1515085115377772
3 0.13531105489561282
4 0.06780498015453475
9 0.04539833136958728
5 0.02983504958532268
6 0.007557895637294794
7 0.007514688233324302
8 0.007514608607423405

0 0.3239146979457089
1 0.21573894993478243
2 0.14525808340440868
3 0.14073654896536467
4 0.07677900991703532
9 0.04501240647651081
5 0.03001971166224193
6 0.007555937737508214
7 0.00749238505224968
8 0.007492268904189478

0 0.31540967245137175
1 0.2455372478674326
2 0.15387618434880607
3 0.11776433445702658
4 0.06958508618823488
9 0.0454694806523595
5 0.029862323423880312
6 0.007550844343998941
7 0.007472470469363186
8 0.007472355797526257

0 0.3288373150630124
1 0.1984831879713159
2 0.14673351329227885
3 0.13721294614635954
4 0.09099655027113343
9 0.04408525595608986
5 0.031035301494144973
8 0.007542328434232405
7 0.007542260756614362
6 0.007531340614818258

0 0.3193586979466173
2 0.19372032290375615
1 0.17385196290359722
3 0.15067297324203144
4 0.06431584983802775
9 0.04459867057873216
5 0.030109171006041484
7 0.007889712095309039
8 0.007889665041634011
6 0.007592974444253261

0 0.30194242995872983
1 0.19005321586219182
2 0.18325896599060398
3 0.13814239333008232
4 0.08613769346253837
9 0.04615261626883708
5 0.031097388101704492
8 0.007811116080942582
7 0.00781095323650845
6 0.007593227707861054

0 0.3018333962608689
1 0.2101327107384649
2 0.17976426989184977
3 0.13028433691427393
4 0.0785628713958075
9 0.04589483819874114
5 0.030473519782385595
8 0.007732641246914153
7 0.007732610576531091
6 0.007588804994162942

0 0.2731844012758353
1 0.2437828502953355
2 0.15602289141674863
3 0.14124037120394928
4 0.08621787369766198
9 0.04595175938704448
5 0.03092529389836489
6 0.007624331986973666
8 0.007525230917852694
7 0.007524995920233587

0 0.2754166406308875
1 0.23790442921451685
2 0.16407059546150252
3 0.15224175554500483
4 0.06911748308147463
9 0.04812626048120002
5 0.030286734901316135
6 0.007658288924620451
8 0.007588920737731278
7 0.0075888910217458685

0 0.30106045484425825
1 0.24631760902734479
3 0.14410465573241527
2 0.12524822006978883
4 0.08639636802254201
9 0.04423754911967344
5 0.030456963732366213
6 0.007563767806759953
7 0.00730722647659661
8 0.0073071851682546506

0 0.31562044416659885
1 0.2526998226062497
2 0.14535277752245734
3 0.12218888456011873
4 0.0662622160602047
9 0.04569769393216447
5 0.02981990651641202
6 0.007553483942629782
7 0.007402409879010876
8 0.00740236081415358

0 0.32058662644832936
1 0.20394629563372832
2 0.14716518634378675
3 0.1428368219219527
4 0.08608013435759047
9 0.046286592753669746
5 0.030458243185553725
6 0.007557253951254248
8 0.007541483951171858
7 0.007541361452962736

0 0.34720661993137836
1 0.19335921917713203
2 0.1574642973682748
3 0.1315805755079897
4 0.07262519571107433
9 0.044460628102199806
5 0.03055492723901837
7 0.0076115120495338995
8 0.007611307835359196
6 0.0075257170780395

0 0.276113881630822
1 0.23014600960245993
2 0.1791490549120877
3 0.13887566068288787
4 0.0755460207226412
9 0.04662339718458672
5 0.03051544813778674
7 0.007695876502113754
8 0.007695757393934347
6 0.00763889323067976

0 0.3143607517778961
1 0.23372981830486098
2 0.15462561577264133
3 0.12748419732486796
4 0.07250495549430869
9 0.04461512010586714
5 0.030101883279393397
6 0.007561984484167818
8 0.007507862285458234
7 0.007507811170538331

0 0.27641365880548574
1 0.25811740868556055
2 0.15554256853273
3 0.1299929833331776
4 0.08206677473861888
9 0.04536961458219285
5 0.029948048909439938
6 0.007602171935254187
7 0.007473418458492894
8 0.007473352019047361


./bin/run-example mllib.LDAExample data/mllib/sample_lda_data.txt --k=3

Corpus summary:
Training set size: 12 documents
Vocabulary size: 10 terms
Training set size: 62 tokens
Preprocessing time: 3.952237148 sec

Finished training LDA model. Summary:
Training time: 2.096613705 sec
Training data average log likelihood: -19.94356723200465

3 topics:
0 0.3859910534194794
1 0.27784612698328287
3 0.11309885992982183
2 0.07754302744583665
4 0.05944898611629052
9 0.03956337141492645
5 0.026961271416303733
6 0.00729179016688132
7 0.006153234509962162
8 0.006102278597215016

0 0.27760966762034767
2 0.20133458026264603
1 0.17674769647471528
3 0.17119003866520263
4 0.05856966790108555
9 0.054142861315096644
5 0.036444242827270906
6 0.008131736037023057
7 0.008064284551869805
8 0.007765224344742546

0 0.26781477963691924
1 0.20419276965972555
2 0.1988283635339889
3 0.12491741400170082
4 0.10934994319103938
9 0.0426866734196486
5 0.027519743427499636
8 0.008867804538854353
7 0.008517400571616986
6 0.007305108019006569

root@wx-social-consume2:/usr/spark-2.4.0# cat data/mllib/sample_lda_data.txt
1 2 6 0 2 3 1 1 0 0 3
1 3 0 1 3 0 0 2 0 0 1
1 4 1 0 0 4 9 0 1 2 0
2 1 0 3 0 0 5 0 2 3 9
3 1 1 9 3 0 2 0 0 1 3
4 2 0 3 4 5 1 1 1 4 0
2 1 0 3 0 0 5 0 2 2 9
1 1 1 9 2 1 2 0 0 1 3
4 4 0 3 4 2 1 3 0 0 0
2 8 2 0 3 0 2 0 2 7 2
1 1 1 9 0 2 2 0 0 3 3
4 1 0 0 4 5 1 3 0 1 0

0 0.26781477963691924
1 0.20419276965972555
2 0.1988283635339889
3 0.12491741400170082
4 0.10934994319103938
9 0.0426866734196486
5 0.027519743427499636
8 0.008867804538854353
7 0.008517400571616986
6 0.007305108019006569

说明:上面输出中的每一组对应一个主题,每行为"词 权重",按权重降序排列;这些结果来自下面的 describeTopics 调用(每个主题最多取 10 个词):
val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10)


