Coursera课程笔记----P4E.Capstone----Week 4&5
Spidering and Modeling Email Data(week4&5)
Mailing List - Gmane
- Crawl the archive of a mailing list
- Do some analysis / cleanup
- Visualize the data as word cloud and lines
code segment
gmane.py
import sqlite3
import time
import ssl
import urllib.request, urllib.parse, urllib.error
from urllib.parse import urljoin
from urllib.parse import urlparse
import re
from datetime import datetime, timedelta
# Not all systems have this so conditionally define parser
try:
import dateutil.parser as parser
except:
pass
def parsemaildate(md):
    """Convert the date portion of a mail ``Date:`` header to ISO 8601 text.

    Args:
        md: Date text without the leading weekday, for example
            ``"12 Dec 2005 09:14:23 -0500"``.

    Returns:
        An ISO 8601 string, or ``None`` when the fallback parser cannot
        recognise the day/month/year/time fields.
    """
    # Preferred path: dateutil.  ``parser`` is only bound when the optional
    # ``import dateutil.parser`` at the top of the file succeeded, so any
    # failure here (including NameError) just means "use the fallback".
    # The argument is ``md``; the previous code parsed an unrelated name.
    try:
        return parser.parse(md).isoformat()
    except Exception:
        pass

    # Non-dateutil version - we try our best.
    pieces = md.split()
    # split()/join() already collapses repeated blanks, so one spelling of
    # each format is enough.
    notz = " ".join(pieces[:4])

    dnotz = None
    for form in ('%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
                 '%d %b %y %H:%M:%S', '%d %b %y %H:%M'):
        try:
            dnotz = datetime.strptime(notz, form)
            break
        except ValueError:
            continue

    if dnotz is None:
        return None

    # Only a strictly numeric "+HHMM" / "-HHMM" token is used as an offset;
    # anything else (missing, "EST", "(PST)", ...) is treated as UTC so the
    # result always ends in a well-formed "+HH:MM" / "-HH:MM" suffix.
    tz = "+00:00"
    if len(pieces) > 4 and re.fullmatch(r'[+-]\d{4}', pieces[4]):
        raw = pieces[4]
        if raw != '-0000':
            tz = raw[:3] + ":" + raw[3:]

    return dnotz.isoformat() + tz
# Crawl script: downloads messages one at a time from ``baseurl`` and stores
# the raw headers/body plus a few extracted fields in content.sqlite.  It can
# be stopped and restarted; it resumes after the highest stored message id.

# Ignore SSL certificate errors
# NOTE(review): this turns off certificate and hostname verification for
# every request made with ``ctx`` - confirm that is acceptable before reuse.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

conn = sqlite3.connect('content.sqlite')
cur = conn.cursor()

baseurl = "http://mbox.dr-chuck.net/sakai.devel/"

cur.execute('''CREATE TABLE IF NOT EXISTS Messages
(id INTEGER UNIQUE, email TEXT, sent_at TEXT,
subject TEXT, headers TEXT, body TEXT)''')

INSERT_MESSAGE_SQL = '''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body)
VALUES ( ?, ?, ?, ?, ?, ? )'''

# Header patterns.  Raw strings anchored on the preceding newline: the old
# '\Date' / '\Subject' spellings were really the regex classes \D and \S
# followed by "ate:" / "ubject:", so they could match inside other headers.
FROM_PATTERNS = (
    re.compile(r'\nFrom: .* <(\S+@\S+)>\n'),
    re.compile(r'\nFrom: (\S+@\S+)\n'),
)
DATE_PATTERN = re.compile(r'\nDate: .*, (.*)\n')
SUBJECT_PATTERN = re.compile(r'\nSubject: (.*)\n')

# Pick up where we left off
cur.execute('SELECT max(id) FROM Messages')
row = cur.fetchone()
start = row[0] if row is not None and row[0] is not None else 0

many = 0    # messages still to fetch in the current batch
count = 0   # pages retrieved during this run
fail = 0    # consecutive failures; the run stops after more than 5

while True:
    if many < 1:
        sval = input('How many messages:')
        if len(sval) < 1:
            break
        many = int(sval)

    start = start + 1
    cur.execute('SELECT id FROM Messages WHERE id=?', (start,))
    if cur.fetchone() is not None:
        # Already stored - move on without using up the batch budget.
        continue

    many = many - 1
    url = baseurl + str(start) + '/' + str(start + 1)

    text = "None"
    try:
        # Open with a timeout of 30 seconds; the context manager closes the
        # response even when reading or decoding fails.
        with urllib.request.urlopen(url, None, 30, context=ctx) as document:
            text = document.read().decode()
            status = document.getcode()
        if status != 200:
            print("Error code=", status, url)
            break
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except Exception as e:
        print("Unable to retrieve or parse page", url)
        print("Error", e)
        fail = fail + 1
        if fail > 5:
            break
        continue

    print(url, len(text))
    count = count + 1

    if not text.startswith("From "):
        print(text)
        print("Did not find From ")
        fail = fail + 1
        if fail > 5:
            break
        continue

    # Headers and body are separated by the first blank line.
    pos = text.find("\n\n")
    if pos > 0:
        hdr = text[:pos]
        body = text[pos+2:]
    else:
        print(text)
        print("Could not find break between headers and body")
        fail = fail + 1
        if fail > 5:
            break
        continue

    # Sender: prefer the "Name <addr>" form, then a bare address.  A pattern
    # is only trusted when it matches exactly once.
    email = None
    for pattern in FROM_PATTERNS:
        found = pattern.findall(hdr)
        if len(found) == 1:
            email = found[0].strip().lower().replace("<", "")
            break

    # Reset for every message so a missing Date header stores NULL instead
    # of raising NameError or reusing the previous message's timestamp.
    sent_at = None
    y = DATE_PATTERN.findall(hdr)
    if len(y) == 1:
        tdate = y[0][:26]
        try:
            sent_at = parsemaildate(tdate)
        except Exception:
            print(text)
            print("Parse fail", tdate)
            fail = fail + 1
            if fail > 5:
                break
            continue

    subject = None
    z = SUBJECT_PATTERN.findall(hdr)
    if len(z) == 1:
        subject = z[0].strip().lower()

    # Reset the fail counter
    fail = 0
    print("   ", email, sent_at, subject)
    cur.execute(INSERT_MESSAGE_SQL, (start, email, sent_at, subject, hdr, body))
    if count % 50 == 0:
        conn.commit()
    if count % 100 == 0:
        time.sleep(1)

conn.commit()
cur.close()
conn.close()
gmodel.py
import sqlite3
import time
import re
import zlib
from datetime import datetime, timedelta
# Not all systems have this
try:
import dateutil.parser as parser
except:
pass
# Lookup tables consulted by fixsender(); both start empty and are filled by
# the script before the main pass.
dnsmapping = {}   # domain -> replacement domain
mapping = {}      # address -> replacement address


def fixsender(sender, allsenders=None):
    """Return a normalized form of an e-mail address.

    The address is stripped, lower-cased and freed of angle brackets.  When
    ``allsenders`` is given and the address ends in ``gmane.org``, the text
    before the first ``-`` is used as a prefix to look up a replacement,
    first in ``allsenders`` and then in the keys of ``mapping``; if neither
    has one, the prefix itself becomes the address.  Finally the domain is
    cut down to its last two labels (.edu/.com/.org/.net) or last three
    labels (anything else) and passed through ``dnsmapping``.

    Returns ``None`` for a ``None`` input, and the cleaned text unchanged
    when it does not contain exactly one ``@``.
    """
    if sender is None:
        return None

    sender = sender.strip().lower().replace('<', '').replace('>', '')

    # Check if we have a hacked gmane.org from address
    if allsenders is not None and sender.endswith('gmane.org'):
        prefix = sender.split('-')[0]
        for candidate in allsenders:
            if candidate.startswith(prefix):
                sender = candidate
                break
        else:
            for known in mapping:
                if known.startswith(prefix):
                    sender = mapping[known]
                    break
            else:
                sender = prefix

    parts = sender.split("@")
    if len(parts) != 2:
        return sender
    local, domain = parts

    labels = domain.split(".")
    keep = 2 if domain.endswith((".edu", ".com", ".org", ".net")) else 3
    domain = ".".join(labels[-keep:])

    return local + '@' + dnsmapping.get(domain, domain)
def parsemaildate(md):
    """Convert the date portion of a mail ``Date:`` header to ISO 8601 text.

    Args:
        md: Date text without the leading weekday, for example
            ``"12 Dec 2005 09:14:23 -0500"``.

    Returns:
        An ISO 8601 string, or ``None`` when the fallback parser cannot
        recognise the day/month/year/time fields.
    """
    # Preferred path: dateutil.  ``parser`` is only bound when the optional
    # ``import dateutil.parser`` at the top of the file succeeded, so any
    # failure here (including NameError) just means "use the fallback".
    try:
        return parser.parse(md).isoformat()
    except Exception:
        pass

    # Non-dateutil version - we try our best.
    pieces = md.split()
    # split()/join() already collapses repeated blanks, so one spelling of
    # each format is enough.
    notz = " ".join(pieces[:4])

    dnotz = None
    for form in ('%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
                 '%d %b %y %H:%M:%S', '%d %b %y %H:%M'):
        try:
            dnotz = datetime.strptime(notz, form)
            break
        except ValueError:
            continue

    if dnotz is None:
        return None

    # Only a strictly numeric "+HHMM" / "-HHMM" token is used as an offset.
    # Anything else (missing, "EST", ...) is treated as UTC, so the suffix is
    # always "+HH:MM" / "-HH:MM"; the result is later handed to SQLite's
    # datetime(), which does not accept "+0000" or a zone name.
    tz = "+00:00"
    if len(pieces) > 4 and re.fullmatch(r'[+-]\d{4}', pieces[4]):
        raw = pieces[4]
        if raw != '-0000':
            tz = raw[:3] + ":" + raw[3:]

    return dnotz.isoformat() + tz
# Parse out the info...
def parseheader(hdr, allsenders=None):
    """Extract the indexing fields from one message's raw header text.

    Args:
        hdr: The header block of a message as a single string.
        allsenders: Optional list of known sender addresses, passed through
            to ``fixsender`` for resolving gmane.org addresses.

    Returns:
        ``(guid, sender, subject, sent_at)``, or ``None`` when the header is
        empty, the date cannot be parsed, or any of the four fields is
        missing.
    """
    if hdr is None or len(hdr) < 1:
        return None

    # Every pattern requires a newline after the value.  The crawler stores
    # headers cut just before the blank line, so without this a header on
    # the very last line could never match.
    searchable = hdr + "\n"

    # Prefer the "Name <addr>" form, then a bare address; first match wins.
    sender = None
    for pattern in (r'\nFrom: .* <(\S+@\S+)>\n', r'\nFrom: (\S+@\S+)\n'):
        found = re.findall(pattern, searchable)
        if found:
            sender = found[0]
            break

    # normalize the domain name of Email addresses
    sender = fixsender(sender, allsenders)

    sent_at = None
    dates = re.findall(r'\nDate: .*, (.*)\n', searchable)
    if dates:
        tdate = dates[0][:26]
        try:
            sent_at = parsemaildate(tdate)
        except Exception:
            # An unparseable date makes the whole message unusable.
            return None

    subject = None
    subjects_found = re.findall(r'\nSubject: (.*)\n', searchable)
    if subjects_found:
        subject = subjects_found[0].strip().lower()

    guid = None
    ids_found = re.findall(r'\nMessage-ID: (.*)\n', searchable)
    if ids_found:
        guid = ids_found[0].strip().lower()

    if sender is None or sent_at is None or subject is None or guid is None:
        return None
    return (guid, sender, subject, sent_at)
# Indexing script: rebuilds index.sqlite from scratch out of the raw messages
# in content.sqlite, using the address/domain rewrites in mapping.sqlite.
conn = sqlite3.connect('index.sqlite')
cur = conn.cursor()

cur.execute('''DROP TABLE IF EXISTS Messages ''')
cur.execute('''DROP TABLE IF EXISTS Senders ''')
cur.execute('''DROP TABLE IF EXISTS Subjects ''')
cur.execute('''DROP TABLE IF EXISTS Replies ''')

cur.execute('''CREATE TABLE IF NOT EXISTS Messages
(id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER,
sender_id INTEGER, subject_id INTEGER,
headers BLOB, body BLOB)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Senders
(id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Subjects
(id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Replies
(from_id INTEGER, to_id INTEGER)''')

# Load the rewrite tables.  dnsmapping must be filled first because
# fixsender() consults it while the address mapping is being built.
conn_1 = sqlite3.connect('mapping.sqlite')
cur_1 = conn_1.cursor()

cur_1.execute('''SELECT old,new FROM DNSMapping''')
for message_row in cur_1:
    dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower()

mapping = dict()
cur_1.execute('''SELECT old,new FROM Mapping''')
for message_row in cur_1:
    old = fixsender(message_row[0])
    new = fixsender(message_row[1])
    mapping[old] = fixsender(new)

# Done with mapping.sqlite
cur_1.close()
conn_1.close()

# Open the main content (Read only)
conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True)
cur_1 = conn_1.cursor()

# allsenders stays a list because fixsender() takes the first prefix match
# in insertion order; the set only makes the duplicate check O(1).
allsenders = list()
seen_senders = set()
cur_1.execute('''SELECT email FROM Messages''')
for message_row in cur_1:
    sender = fixsender(message_row[0])
    if sender is None:
        continue
    if 'gmane.org' in sender:
        continue
    if sender in seen_senders:
        continue
    seen_senders.add(sender)
    allsenders.append(sender)

print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping))

cur_1.execute('''SELECT headers, body, sent_at
FROM Messages ORDER BY sent_at''')

# In-memory caches of text -> row id, so each sender/subject is looked up in
# the database only once.
senders = dict()
subjects = dict()
guids = dict()

count = 0

for message_row in cur_1:
    hdr = message_row[0]
    parsed = parseheader(hdr, allsenders)
    if parsed is None:
        continue
    (guid, sender, subject, sent_at) = parsed

    # Apply the sender mapping
    sender = mapping.get(sender, sender)

    count = count + 1
    if count % 250 == 1:
        print(count, sent_at, sender)
    # print(guid, sender, subject, sent_at)

    if 'gmane.org' in sender:
        print("Error in sender ===", sender)

    sender_id = senders.get(sender, None)
    subject_id = subjects.get(subject, None)

    if sender_id is None:
        cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', (sender,))
        conn.commit()
        cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', (sender,))
        row = cur.fetchone()
        if row is None:
            print('Could not retrieve sender id', sender)
            break
        sender_id = row[0]
        senders[sender] = sender_id

    if subject_id is None:
        cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', (subject,))
        conn.commit()
        cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', (subject,))
        row = cur.fetchone()
        if row is None:
            print('Could not retrieve subject id', subject)
            break
        subject_id = row[0]
        subjects[subject] = subject_id

    # print(sender_id, subject_id)
    # Headers and body are stored zlib-compressed; a NULL body is stored as
    # an empty compressed blob instead of aborting the run.
    body = message_row[1] or ''
    cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )',
                (guid, sender_id, subject_id, sent_at,
                 zlib.compress(hdr.encode()), zlib.compress(body.encode())))
    conn.commit()
    cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', (guid,))
    row = cur.fetchone()
    if row is None:
        print('Could not retrieve guid id', guid)
        break
    guids[guid] = row[0]

# Release both databases explicitly, also after an early break above.
conn.commit()
cur.close()
conn.close()
cur_1.close()
conn_1.close()
gbasic.py
import sqlite3
import time
import zlib
# Report script: reads index.sqlite and prints the most active senders and
# sender domains.
howmany = int(input("How many to dump? "))

conn = sqlite3.connect('index.sqlite')
cur = conn.cursor()

# Row id -> sender address.
cur.execute('SELECT id, sender FROM Senders')
senders = {row[0]: row[1] for row in cur}

# Row id -> subject line.
cur.execute('SELECT id, subject FROM Subjects')
subjects = {row[0]: row[1] for row in cur}

# Message id -> (guid, sender_id, subject_id, sent_at).
cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages')
messages = {row[0]: tuple(row[1:5]) for row in cur}

print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders))

# Tally messages per sender id and per domain (the text after the "@").
sendcounts = dict()
sendorgs = dict()
for _guid, sender_id, _subject_id, _sent_at in messages.values():
    sendcounts[sender_id] = sendcounts.get(sender_id, 0) + 1
    parts = senders[sender_id].split("@")
    if len(parts) == 2:
        org = parts[1]
        sendorgs[org] = sendorgs.get(org, 0) + 1

print('')
print('Top',howmany,'Email list participants')

ranked_senders = sorted(sendcounts.items(), key=lambda item: item[1], reverse=True)
for sender_id, total in ranked_senders[:howmany]:
    print(senders[sender_id], total)
    # The first entry below 10 is still printed, then the listing stops.
    if total < 10:
        break

print('')
print('Top',howmany,'Email list organizations')

ranked_orgs = sorted(sendorgs.items(), key=lambda item: item[1], reverse=True)
for org, total in ranked_orgs[:howmany]:
    print(org, total)
    if total < 10:
        break
Coursera课程笔记----P4E.Capstone----Week 4&5的更多相关文章
- Coursera课程笔记----P4E.Capstone----Week 6&7
Visualizing Email Data(Week 6&7) code segment gword.py import sqlite3 import time import zlib im ...
- Coursera课程笔记----P4E.Capstone----Week 2&3
Building a Search Engine(week 2&3) Search Engine Architecture Web Crawling Index Building Search ...
- 操作系统学习笔记----进程/线程模型----Coursera课程笔记
操作系统学习笔记----进程/线程模型----Coursera课程笔记 进程/线程模型 0. 概述 0.1 进程模型 多道程序设计 进程的概念.进程控制块 进程状态及转换.进程队列 进程控制----进 ...
- Coursera课程笔记----C++程序设计----Week3
类和对象(Week 3) 内联成员函数和重载成员函数 内联成员函数 inline + 成员函数 整个函数体出现在类定义内部 class B{ inline void func1(); //方式1 vo ...
- Coursera课程笔记----Write Professional Emails in English----Week 3
Introduction and Announcement Emails (Week 3) Overview of Introduction & Announcement Emails Bas ...
- Coursera课程笔记----Write Professional Emails in English----Week 1
Get to Know Basic Email Writing Structures(Week 1) Introduction to Course Email and Editing Basics S ...
- Coursera课程笔记----C程序设计进阶----Week 5
指针(二) (Week 5) 字符串与指针 指向数组的指针 int a[10]; int *p; p = a; 指向字符串的指针 指向字符串的指针变量 char a[10]; char *p; p = ...
- Coursera课程笔记----Write Professional Emails in English----Week 5
Culture Matters(Week 5) High/Low Context Communication High Context Communication The Middle East, A ...
- Coursera课程笔记----Write Professional Emails in English----Week 4
Request and Apology Emails(Week 4) How to Write Request Emails Write more POLITELY & SINCERELY ...
随机推荐
- 资料整理:python自动化测试——操作测试对象
文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者:爱吃米饭的猪 PS:如有需要Python学习资料的小伙伴可以加点击下方链接自 ...
- F - Make It Equal CodeForces - 1065C
题目大意:有n座塔,塔高h[i],每次给定高度H对他们进行削切,要求每次削掉的所有格子数不能超过k个,输出最少削几次才能使所有塔的高度相同. 思路一:差分+贪心 对于每一个高度h,用一个数组让1~h的 ...
- element动态添加表头的正确姿势
1. 第一步循环 el-table-column <el-table-column v-if="item.show" v-for="(item, index) in ...
- 2020i春秋新春战疫
简单的招聘系统 登陆这里就可以注入 查询这里也可以注入 从登陆这里注入把 爆破数据库名 爆破表名 列名 flag 就很奇怪跑出来的东西 重开容器跑一遍列,估计是flaaag.后面可能是发生了502 再 ...
- React Hooks: use modal
useModal: export const useModal = (initTitle: string, initContent: string | React.ReactElement) => ...
- JDBC 进阶:使用封装通用DML DQL 和结构分层以及at com.mysql.jdbc.PreparedStatement.setTimestamp空指针异常解决
准备: 数据表 CREATE TABLE `t_user` ( `id` int(11) NOT NULL AUTO_INCREMENT, `username` varchar(10) DEFAULT ...
- deepin15.11小毛病解决
目录 边缘花屏问题 QQ`Tim头像问题 ssh卡死问题 看直播卡 边缘花屏问题 sudo apt install systemsettings 打开kde系统设置 打开显示与设置,修改如图下,基本上 ...
- Windows API 中 OVERLAPPED 结构体 初始化
出处:https://github.com/microsoft/Windows-classic-samples/blob/1d363ff4bd17d8e20415b92e2ee989d615cc0d9 ...
- Asp.Net Core 3.1学习-依赖注入、服务生命周期(6)
1.前言 面向对象设计(OOD)里有一个重要的思想就是依赖倒置原则(DIP),并由该原则牵引出依赖注入(DI).控制反转(IOC)及其容器等概念.在学习Core依赖注入.服务生命周期之前,下面让我们先 ...
- C# 基础知识系列- 14 IO篇 文件的操作
0. 前言 本章节是IO篇的第二集,我们在上一篇中介绍了C#中IO的基本概念和一些基本方法,接下来我们介绍一下操作文件的方法.在编程的世界中,操作文件是一个很重要的技能. 1. 文件.目录和路径 在开 ...