# -*- coding:utf-8 -*-

import sys
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os def iterbrowse(path):
for home, dirs, files in os.walk(path):
for filename in files:
yield os.path.join(home, filename) def extract_domain(domain):
suffix = {'.com','.la','.io', '.co', '.cn','.info', '.net', '.org','.me', '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn', '.org.cn', '.mx','.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag','.am','.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live', '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin', '.social', '.date', '.site', '.red', '.studio', '.link', '.online', '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market', '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl','.nu','.co.nz','.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg'} domain = domain.lower()
names = domain.split(".")
if len(names) >= 3:
if ("."+".".join(names[-2:])) in suffix:
return ".".join(names[-3:]), ".".join(names[:-3])
elif ("."+names[-1]) in suffix:
return ".".join(names[-2:]), ".".join(names[:-2])
print "New domain suffix found. Use tld extract domain..." pos = domain.rfind("/")
if pos >= 0: # maybe subdomain contains /, for dns tunnel tool
ext = tldextract.extract(domain[pos+1:])
subdomain = domain[:pos+1] + ext.subdomain
else:
ext = tldextract.extract(domain)
subdomain = ext.subdomain
if ext.suffix:
mdomain = ext.domain + "." + ext.suffix
else:
mdomain = ext.domain
return mdomain, subdomain def parse(log):
data = log.split('^')
SRC_PORT_IDX = 5-1
DST_PORT_IDX = 6-1
PROTOCOL_IDX = 7-1
protol = data[PROTOCOL_IDX]
dstport = data[DST_PORT_IDX]
if '' == protol and ('' == dstport):
DNS_QUERY_NAME_IDX = 55-1 # domain
if (len(data) < 55):
print "error line:"
print log
return ("", "")
domain = data[DNS_QUERY_NAME_IDX]
mdomain, subdomain = extract_domain(domain)
return (mdomain, subdomain)
else:
print "error line not a DNS:"
print log
return ("", "") #处理域名的最小长度
MIN_LEN=3 #状态个数
N=5
#最大似然概率阈值
T=-50 #模型文件名
FILE_MODEL="hmm-cdn.m" def get_cdn_domains(dir_path):
domain_list=[]
for path in iterbrowse(dir_path):
with open(path) as f:
for line in f:
mdomain, sub_domain = parse(line)
if len(sub_domain) >= MIN_LEN:
domain_list.append(sub_domain)
if len(domain_list) >= 2000:
return domain_list
#else:
# print path, "pass line:", line
return domain_list def domain2ver(domain):
ver=[]
for i in range(0,len(domain)):
ver.append([ord(domain[i])])
return ver def train_hmm(domain_list):
X = [[0]]
X_lens = [1]
for domain in domain_list:
ver=domain2ver(domain)
np_ver = np.array(ver)
#print len(np_ver)
try:
X=np.concatenate([X,np_ver])
except ValueError:
print domain
print len(X), len(np_ver)
print X
print np_ver
raise
X_lens.append(len(np_ver)) remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
remodel.fit(X,X_lens)
joblib.dump(remodel, FILE_MODEL) return remodel def test(remodel, domain_list):
x=[]
y=[]
for domain in domain_list:
domain_ver=domain2ver(domain)
np_ver = np.array(domain_ver)
pro = remodel.score(np_ver)
print "SCORE:(%d) DOMAIN:(%s) " % (pro, domain)
x.append(len(domain))
y.append(pro)
return x,y if __name__ == '__main__':
domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
remodel=train_hmm(domain_list)
remodel=joblib.load(FILE_MODEL) x_1,y_1=test(remodel, domain_list)
print x_1
print y_1
#sys.exit(0)
domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
x_2,y_2=test(remodel, domain_list)
print x_2
print y_2
domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
x_3,y_3=test(remodel, domain_list)
print x_3
print y_3
#%matplotlib inline
fig,ax=plt.subplots()
ax.set_xlabel('Domain Length')
ax.set_ylabel('HMM Score')
ax.scatter(x_3,y_3,color='b',label="WHITE")
ax.scatter(x_2, y_2, color='g', label="BLACK")
ax.scatter(x_1, y_1, color='r', label="CDN")
ax.legend(loc='right')
plt.show()

使用pickle保存和加载模型:

# -*- coding:utf-8 -*-

import sys
import re
from hmmlearn import hmm
import numpy as np
#from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os
import pickle def iterbrowse(path):
for home, dirs, files in os.walk(path):
for filename in files:
yield os.path.join(home, filename) def extract_domain(domain):
suffix = {'.com','.la','.io', '.co', '.cn','.info', '.net', '.org','.me', '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn', '.org.cn', '.mx','.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag','.am','.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live', '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin', '.social', '.date', '.site', '.red', '.studio', '.link', '.online', '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market', '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl','.nu','.co.nz','.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg'} domain = domain.lower()
names = domain.split(".")
if len(names) >= 3:
if ("."+".".join(names[-2:])) in suffix:
return ".".join(names[-3:]), ".".join(names[:-3])
elif ("."+names[-1]) in suffix:
return ".".join(names[-2:]), ".".join(names[:-2])
print "New domain suffix found. Use tld extract domain..." pos = domain.rfind("/")
if pos >= 0: # maybe subdomain contains /, for dns tunnel tool
ext = tldextract.extract(domain[pos+1:])
subdomain = domain[:pos+1] + ext.subdomain
else:
ext = tldextract.extract(domain)
subdomain = ext.subdomain
if ext.suffix:
mdomain = ext.domain + "." + ext.suffix
else:
mdomain = ext.domain
return mdomain, subdomain def parse(log):
data = log.split('^')
SRC_PORT_IDX = 5-1
DST_PORT_IDX = 6-1
PROTOCOL_IDX = 7-1
protol = data[PROTOCOL_IDX]
dstport = data[DST_PORT_IDX]
if '' == protol and ('' == dstport):
DNS_QUERY_NAME_IDX = 55-1 # domain
if (len(data) < 55):
print "error line:"
print log
return ("", "")
domain = data[DNS_QUERY_NAME_IDX]
mdomain, subdomain = extract_domain(domain)
return (mdomain, subdomain)
else:
print "error line not a DNS:"
print log
return ("", "") #处理域名的最小长度
MIN_LEN=1 #状态个数
N=8
#最大似然概率阈值
T=-50 #模型文件名
FILE_MODEL="hmm-cdn.m"
FILE_MODEL2 ="hmm-cdn-white.pkl" def get_cdn_domains(dir_path):
domain_list=[]
for path in iterbrowse(dir_path):
with open(path) as f:
for line in f:
mdomain, sub_domain = parse(line)
if len(sub_domain) >= MIN_LEN:
domain_list.append(sub_domain)
if len(domain_list) >= 3000:
return domain_list
#else:
# print path, "pass line:", line
return domain_list def domain2ver(domain):
ver=[]
for i in range(0,len(domain)):
ver.append([ord(domain[i])])
return ver def train_hmm(domain_list):
if os.path.exists(FILE_MODEL2):
print "found model file, use it..."
file_model = open(FILE_MODEL2, 'rb')
model = pickle.load(file_model)
file_model.close()
return model X = [[0]]
X_lens = [1]
for domain in domain_list:
ver=domain2ver(domain)
np_ver = np.array(ver)
#print len(np_ver)
try:
X=np.concatenate([X,np_ver])
except ValueError:
print domain
print len(X), len(np_ver)
print X
print np_ver
raise
X_lens.append(len(np_ver)) #remodel = hmm.GaussianHMM(n_components=N, covariance_type="spherical", n_iter=500) #spherical, diag full,tied
remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
remodel.fit(X,X_lens)
#joblib.dump(remodel, FILE_MODEL) file_model = open(FILE_MODEL2, 'wb')
pickle.dump(remodel, file_model)
file_model.close() return remodel def test(remodel, domain_list):
x=[]
y=[]
for domain in domain_list:
domain_ver=domain2ver(domain)
np_ver = np.array(domain_ver)
pro = remodel.score(np_ver)
print "SCORE:(%d) DOMAIN:(%s) " % (pro, domain)
x.append(len(domain))
y.append(pro)
return x,y if __name__ == '__main__':
domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
domain_list2 = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
#remodel=train_hmm(domain_list)
remodel=train_hmm(domain_list+domain_list2)
#remodel=joblib.load(FILE_MODEL) x_1,y_1=test(remodel, domain_list)
print x_1
print y_1
#sys.exit(0)
domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
x_2,y_2=test(remodel, domain_list)
print x_2
print y_2
domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
x_3,y_3=test(remodel, domain_list)
print x_3
print y_3
#%matplotlib inline
fig,ax=plt.subplots()
ax.set_xlabel('Domain Length')
ax.set_ylabel('HMM Score')
#ax.scatter(x_3,y_3,color='b',label="WHITE")
ax.scatter(x_2, y_2, color='g', label="DNS tunnel")
ax.scatter(x_1, y_1, color='r', label="CDN")
ax.legend(loc='right')
plt.show()

其中:X = [[0]],X_lens = [] 也可以按照下面方式进行读写。除去了冗余的初始化。

def train_hmm(domain_list):
    """Fit a GaussianHMM on the concatenated character sequences.

    If a pickled model already exists at FILE_MODEL2 it is loaded and
    returned instead of retraining; a freshly trained model is pickled
    there. Unlike the earlier variant, X is NOT seeded with a dummy
    [[0]] observation.
    """
    if os.path.exists(FILE_MODEL2):
        print("found model file, use it...")
        with open(FILE_MODEL2, 'rb') as file_model:
            model = pickle.load(file_model)
        return model
    X = []
    X_lens = []
    for domain in domain_list:
        ver = domain2ver(domain)
        # extend() instead of X = X + ver: the repeated concatenation was
        # O(n^2), and the try/except ValueError around it was dead code
        # (list concatenation cannot raise ValueError).
        X.extend(ver)
        X_lens.append(len(ver))
    #remodel = hmm.GaussianHMM(n_components=N, covariance_type="spherical", n_iter=500) #spherical, diag full, tied
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(X, X_lens)
    with open(FILE_MODEL2, 'wb') as file_model:
        pickle.dump(remodel, file_model)
    return remodel

hmm CDN检测的更多相关文章

  1. HMM XSS检测

    HMM XSS检测 转自:http://www.freebuf.com/articles/web/133909.html 前言 上篇我们介绍了HMM的基本原理以及常见的基于参数的异常检测实现,这次我们 ...

  2. 绕过CDN查找真实IP方法总结

    CDN的全称是Content Delivery Network,即内容分发网络.CDN是构建在现有网络基础之上的智能虚拟网络,依靠部署在各地的边缘服务器,通过中心平台的负载均衡.内容分发.调度等功能模 ...

  3. [转载]绕过CDN查找真实IP方法总结

    前言 类似备忘录形式记录一下,这里结合了几篇绕过CDN寻找真实IP的文章,总结一下绕过CDN查找真实的IP的方法 介绍 CDN的全称是Content Delivery Network,即内容分发网络. ...

  4. web渗透测试

    信息收集 网络搜索 目录遍历:site:域名 intitle:index.of 配置文件泄露:site:域名 ext:xml | ext:conf | ext:cnf | ext:reg | ext: ...

  5. 基于Python的渗透测试信息收集系统的设计和实现

    信息收集系统的设计和实现 渗透测试是保卫网络安全的一种有效且必要的技术手段,而渗透测试的本质就是信息收集,信息搜集整理可为后续的情报跟进提供强大的保证,目标资产信息搜集的广度,决定渗透过程的复杂程度, ...

  6. 脚本检测CDN节点资源是否与源站资源一致

    需求: 1.所有要检测的资源url放到一个单独文件中 2.检测cdn节点资源大小与源站文件大小是否一致 3.随机抽查几个资源,检查md5sum是否一致 4.使用多线程,可配置线程数 代码目录: hex ...

  7. 简单检测CDN链接是否有效

    CDN链接经常是使用的.但是,CDN链接挂了怎么办,因此,就要调用使用本站点的库,那么怎么实现呢? 检测CDN的jquery链接是否有效(这种方法比较简单) <script src=" ...

  8. 大数据DDos检测——DDos攻击本质上是时间序列数据,t+1时刻的数据特点和t时刻强相关,因此用HMM或者CRF来做检测是必然! 和一个句子的分词算法CRF没有区别!

    DDos攻击本质上是时间序列数据,t+1时刻的数据特点和t时刻强相关,因此用HMM或者CRF来做检测是必然!——和一个句子的分词算法CRF没有区别!注:传统DDos检测直接基于IP数据发送流量来识别, ...

  9. 基于机器学习的web异常检测——基于HMM的状态序列建模,将原始数据转化为状态机表示,然后求解概率判断异常与否

    基于机器学习的web异常检测 from: https://jaq.alibaba.com/community/art/show?articleid=746 Web防火墙是信息安全的第一道防线.随着网络 ...

随机推荐

  1. mac安装python3 pandas tushare

    1,升级pip python3 -m pip install --upgrade pip 2,安装依赖包 pip install --user numpy scipy jupyter pandas s ...

  2. mysqldump+mydumper+xtrabackup备份原理流程

    mysqldump备份原理 备份的基本流程如下: 1.调用FTWRL(flush tables with read lock),全局禁止读写 2.开启快照读,获取此时的快照(仅对innodb表起作用) ...

  3. 使用Word 2010群发邮件

    1.建立数据库,这里我使用了excel 字段:电子邮件地址,名字 填写需要发送的数据 2.新建word文档,这里我使用了word2010 点击工具栏邮件 开始邮件合并,电子邮件 选择收件人,使用现有列 ...

  4. React+Antd遇到的坑

    第一次尝试React+antd,发现果然不愧是传说中的坑货,一个又一个坑.必须要记录. react + antd,都是最新版本,使用npm和yarn各种add,build,start 1. 资源文件, ...

  5. (转)基于MVC4+EasyUI的Web开发框架形成之旅--附件上传组件uploadify的使用

    http://www.cnblogs.com/wuhuacong/p/3343967.html 大概一年前,我还在用Asp.NET开发一些行业管理系统的时候,就曾经使用这个组件作为文件的上传操作,在随 ...

  6. VMware虚拟机共享文件夹问题: /mnt下没有hgfs文件夹

    在使用vmware虚拟机共享文件夹功能的时候,发现在/mnt目录下面没有hgfs文件夹,但是vmware-tool的命令vmhgfs-fuse确实存在于系统中.在使用vmhgfs-fuse建立宿主机到 ...

  7. timeval的时间转换成毫秒之后多大的数据类型可以装下

    struct timeval { long tv_sec; /*秒*/ long tv_usec; /*微秒*/ }; 秒的定义为long,为了防止溢出,转换成毫秒之后保存在long long中

  8. 常用css和js内容

    1.让一个200x200的div在不同分辨率屏幕上下左右居中. <div class="box"></div> <style type="t ...

  9. POJ 2115 C Looooops( 简单拓欧 + 快速幂 )

    链接:传送门 题意:题目中给出一个循环 for (variable = A; variable != B; variable += C) ,这个东东还需要 mod 2^k 问至少多次能退出,如果进入死 ...

  10. js:Array对象常用方法介绍

    前言 在js中,数组作为一个特殊的对象.是我们常用的数据格式.今天就来梳理一下常用的数组方法. 1.基础 几种基础的就简单介绍一下:创建数组 var arr1 = new Array(); //括号可 ...