研究开源源码之Myrmec
好久没写博客了,自己也弄不清是懒了还是忙了。毕竟白天需要工作,晚上有时候看看资料,有时候陪家人,有时候约朋友......更加累了,可能由于累了就懒得总结了。
今天有同事问我关于用代码检查文件类型的问题。出于安全考虑,不能只依赖文件后缀或者Mime类型来检查,还需要读取文件头,根据文件头来判断(作为判断的其中一种依据)。后来发现直接使用Myrmec时并没有对CSS和HTML文件的检查。不应该啊!无奈之下只能上Github看看这个工具的源码了。
Myrmec 是一个用于检测文件格式的库,Myrmec不同于其它库或者手写检测代码,Myrmec不依赖文件扩展名(在实际使用中,你的用户很可能使用虚假的扩展名欺骗你的应用程序),Myrmec会检测文件的二进制头,并在其元数据库中匹配来获得文件的格式。
例如Jpg图片的二进制头是 "FF D8 FF DB" 那么Myrmec会匹配到这个文件头,并获得两个结果--"jpg"和"jpeg"。
具体使用我就不详细说了,毕竟百度就有,而且比较简单(真的很简单,哈哈)。直接上源码地址https://github.com/rocketRobin/myrmec。
代码其实也是比较简单的,其中FileTypes是已经标记为过期的类,里面定义了很多标记文件类型的16进制,用List<Record>存储。
// <copyright file="FileTypes.cs" company="Rocket Robin">
// Copyright (c) Rocket Robin. All rights reserved.
// Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
// </copyright>
using System;
using System.Collections.Generic;

namespace Myrmec
{
    /// <summary>
    /// Predefined signature tables that can be used to populate a new sniffer instance.
    /// The class is obsolete: callers are expected to register only the records they need.
    /// </summary>
    [Obsolete("please use populate the file types only you need.")]
    public class FileTypes
    {
        /// <summary>
        /// Builds the <see cref="Unfrequent"/> and <see cref="Common"/> tables once, on first use.
        /// </summary>
        /// <remarks>
        /// Records that take an <c>int</c> third argument match their signature at that byte
        /// offset instead of at the start of the file.
        /// NOTE(review): the offset literals (8, 11, 4, 32769, 34817, 36865, 257) were missing
        /// from this listing and have been restored from the published file-signature table;
        /// confirm them against the upstream repository.
        /// </remarks>
        static FileTypes()
        {
            Unfrequent = new List<Record>
            {
                new Record("bin", "53 50 30 31"),
                new Record("bac", "42 41 43 4B 4D 49 4B 45 44 49 53 4B"),
                new Record("bz2", "42 5A 68"),
                new Record("tif tiff", "49 49 2A 00"),
                new Record("tif tiff", "4D 4D 00 2A"),
                new Record("cr2", "49 49 2A 00 10 00 00 00 43 52"),
                new Record("cin", "80 2A 5F D7"),
                new Record("exr", "76 2F 31 01"),
                new Record("dpx", "53 44 50 58"),
                new Record("dpx", "58 50 44 53"),
                new Record("bpg", "42 50 47 FB"),
                new Record("lz", "4C 5A 49 50"),
                new Record("ps", "25 21 50 53"),
                new Record("fits", "3D 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 54"),
                new Record("doc xls ppt msg", "D0 CF 11 E0 A1 B1 1A E1"),
                new Record("dex", "64 65 78 0A 30 33 35 00"),
                new Record("vmdk", "4B 44 4D"),
                new Record("crx", "43 72 32 34"),
                new Record("cwk", "05 07 00 00 42 4F 42 4F 05 07 00 00 00 00 00 00 00 00 00 00 00 01"),
                new Record("fh8", "41 47 44 33"),
                new Record("cwk", "06 07 E1 00 42 4F 42 4F 06 07 E1 00 00 00 00 00 00 00 00 00 00 01"),
                new Record("toast", "45 52 02 00 00 00"),
                new Record("toast", "8B 45 52 02 00 00 00"),
                new Record("xar", "78 61 72 21"),
                new Record("dat", "50 4D 4F 43 43 4D 4F 43"),
                new Record("nes", "4E 45 53 1A"),
                new Record("tox", "74 6F 78 33"),
                new Record("MLV", "4D 4C 56 49"),
                new Record("lz4", "04 22 4D 18"),
                new Record("cab", "4D 53 43 46"),
                new Record("flif", "46 4C 49 46"),
                new Record("stg", "4D 49 4C 20"),
                new Record("der", "30 82"),
                new Record("wasm", "00 61 73 6d"),
                new Record("lep", "cf 84 01"),
                new Record("rtf", "7B 5C 72 74 66 31"),
                new Record("m2p vob", "00 00 01 BA"),
                new Record("zlib", "78 01"),
                new Record("zlib", "78 9c"),
                new Record("zlib", "78 da"),
                new Record("lzfse", "62 76 78 32"),
                new Record("orc", "4F 52 43"),
                new Record("avro", "4F 62 6A 01"),
                new Record("rc", "53 45 51 36"),
                new Record("tbi", "00 00 00 00 14 00 00 00"),
                new Record("dat", "00 00 00 00 62 31 05 00 09 00 00 00 00 20 00 00 00 09 00 00 00 00 00 00", 8, "Bitcoin Core wallet.dat file"),
                new Record("jp2", "00 00 00 0C 6A 50 20 20 0D 0A", "Various JPEG-2000 image file formats"),
                new Record("ttf", "00 01 00 00 00"),
                new Record("mdf", "00 FF FF FF FF FF FF FF FF FF FF 00 00 02 00 01"),

                // Complex file types: the signature is located at a non-zero offset.
                new Record("pdb", "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00", 11),
                new Record("3gp 3g2", "66 74 79 70 33 67", 4),

                // The same "CD001" marker is registered at three offsets (0x8001, 0x8801, 0x9001).
                new Record("iso", "43 44 30 30 31", 32769),
                new Record("iso", "43 44 30 30 31", 34817),
                new Record("iso", "43 44 30 30 31", 36865),
            };

            Common = new List<Record>
            {
                new Record("asf wma wmv", "30 26 B2 75 8E 66 CF 11 A6 D9 00 AA 00 62 CE 6C"),
                new Record("ogg oga ogv", "4F 67 67 53"),
                new Record("psd", "38 42 50 53"),
                new Record("mp3", "FF FB"),
                new Record("mp3", "49 44 33"),
                new Record("bmp dib", "42 4D"),
                new Record("jpg,jpeg", "ff,d8,ff,db"),
                new Record("png", "89,50,4e,47,0d,0a,1a,0a"),
                new Record("zip,jar,odt,ods,odp,docx,xlsx,pptx,vsdx,apk,aar", "50,4b,03,04"),
                new Record("zip,jar,odt,ods,odp,docx,xlsx,pptx,vsdx,apk,aar", "50,4b,07,08"),
                new Record("zip,jar,odt,ods,odp,docx,xlsx,pptx,vsdx,apk,aar", "50,4b,05,06"),
                new Record("rar", "52,61,72,21,1a,07,00"),
                new Record("rar", "52,61,72,21,1a,07,01,00"),
                new Record("class", "CA FE BA BE"),
                new Record("pdf", "25 50 44 46"),
                new Record("rpm", "ed ab ee db"),
                new Record("flac", "66 4C 61 43"),
                new Record("mid midi", "4D 54 68 64"),
                new Record("ico", "00 00 01 00"),
                new Record("z,tar.z", "1F 9D"),
                new Record("z,tar.z", "1F A0"),
                new Record("gif", "47 49 46 38 37 61"),
                new Record("dmg", "78 01 73 0D 62 62 60"),
                new Record("gif", "47 49 46 38 39 61"),
                new Record("exe", "4D 5A"),

                // "ustar" marker; located inside the header block rather than at byte 0.
                new Record("tar", "75 73 74 61 72", 257),
                new Record("mkv mka mks mk3d webm", "1A 45 DF A3"),
                new Record("gz tar.gz", "1F 8B"),
                new Record("xz tar.xz", "FD 37 7A 58 5A 00 00"),
                new Record("7z", "37 7A BC AF 27 1C"),
                new Record("mpg mpeg", "00 00 01 BA"),
                new Record("mpg mpeg", "00 00 01 B3"),
                new Record("woff", "77 4F 46 46"),
                new Record("woff2", "77 4F 46 32"),
                new Record("XML", "3c 3f 78 6d 6c 20"),
                new Record("swf", "43 57 53"),
                new Record("swf", "46 57 53"),
                new Record("deb", "21 3C 61 72 63 68 3E"),

                // Complex signatures: "??" marks a byte whose value is not fixed.
                new Record("jpg,jpeg","FF D8 FF E0 ?? ?? 4A 46 49 46 00 01"),
                new Record("jpg,jpeg","FF D8 FF E1 ?? ?? 45 78 69 66 00 00"),
            };
        }

        /// <summary>
        /// Gets the common file types. Alias that simply returns <see cref="Common"/>.
        /// </summary>
        [Obsolete("please use populate the file types only you need.")]
        public static List<Record> CommonFileTypes { get => Common; }

        /// <summary>
        /// Gets or sets the table of frequently seen file formats.
        /// </summary>
        [Obsolete("please use populate the file types only you need.")]
        public static List<Record> Common { get; set; }

        /// <summary>
        /// Gets or sets the table of less frequently seen file formats.
        /// </summary>
        [Obsolete("please use populate the file types only you need.")]
        public static List<Record> Unfrequent { get; set; }
    }
}
FileTypes
Record类用于存储文件类型(扩展名)与其16进制文件头之间的对应关系。
// <copyright file="Record.cs" company="Rocket Robin">
// Copyright (c) Rocket Robin. All rights reserved.
// Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
// </copyright>
namespace Myrmec
{
    /// <summary>
    /// One signature entry: a set of file extensions, the hex pattern that identifies them,
    /// and an optional byte offset and description.
    /// </summary>
    public class Record
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="Record"/> class with all members
        /// left at their defaults (<c>null</c> strings, offset 0).
        /// </summary>
        public Record()
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Record"/> class.
        /// </summary>
        /// <param name="extentions">Extension names; several names are separated by "," or " ".</param>
        /// <param name="hex">Hex string of the signature bytes (the predefined tables separate bytes with " " or ",").</param>
        public Record(string extentions, string hex)
            : this(extentions, hex, 0, null)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Record"/> class.
        /// </summary>
        /// <param name="extentions">Extension names; several names are separated by "," or " ".</param>
        /// <param name="hex">Hex string of the signature bytes.</param>
        /// <param name="offset">Offset of this record.</param>
        public Record(string extentions, string hex, int offset)
            : this(extentions, hex, offset, null)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Record"/> class.
        /// </summary>
        /// <param name="extentions">Extension names; several names are separated by "," or " ".</param>
        /// <param name="hex">Hex string of the signature bytes.</param>
        /// <param name="description">Free-text description of the format.</param>
        public Record(string extentions, string hex, string description)
            : this(extentions, hex, 0, description)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Record"/> class.
        /// </summary>
        /// <param name="extentions">Extension names; several names are separated by "," or " ".</param>
        /// <param name="hex">Hex string of the signature bytes.</param>
        /// <param name="offset">Offset of this record.</param>
        /// <param name="description">Free-text description of the format.</param>
        public Record(string extentions, string hex, int offset, string description)
        {
            Offset = offset;
            Extentions = extentions;
            Hex = hex;
            Description = description;
        }

        /// <summary>
        /// Gets or sets the description.
        /// </summary>
        public string Description { get; set; }

        /// <summary>
        /// Gets or sets the file extensions.
        /// </summary>
        public string Extentions { get; set; }

        /// <summary>
        /// Gets or sets the hex string.
        /// </summary>
        public string Hex { get; set; }

        /// <summary>
        /// Gets or sets the offset.
        /// </summary>
        public int Offset { get; set; }

        /// <summary>
        /// Gets a value indicating whether this record has a positive offset or contains
        /// a variable byte (a "?" in <see cref="Hex"/>).
        /// </summary>
        /// <remarks>
        /// A record whose <see cref="Hex"/> is <c>null</c> (for example one created with the
        /// parameterless constructor) is treated as having no variable byte instead of throwing.
        /// </remarks>
        public bool IsComplexMetadata
        {
            get => (Offset > 0) || (Hex?.Contains("?") == true);
        }
    }
}
Record
Node类是类型查询树的节点,用这棵树来存储文件头字节序列,以便快速查询文件类型。
// <copyright file="Node.cs" company="Rocket Robin">
// Copyright (c) Rocket Robin. All rights reserved.
// Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
// </copyright>
using System.Collections.Generic;

namespace Myrmec
{
    /// <summary>
    /// A single node of the signature lookup tree.
    /// All members start out at their default values; the owner of the tree fills them in.
    /// </summary>
    public class Node
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="Node"/> class.
        /// </summary>
        public Node()
        {
        }

        /// <summary>
        /// Gets or sets the child nodes, keyed by byte value.
        /// </summary>
        public SortedList<byte, Node> Children { get; set; }

        /// <summary>
        /// Gets or sets the depth of this node in the tree.
        /// </summary>
        public int Depth { get; set; }

        /// <summary>
        /// Gets or sets the extensions attached to this node.
        /// </summary>
        public List<string> Extentions { get; set; }

        /// <summary>
        /// Gets or sets the parent node.
        /// </summary>
        public Node Parent { get; set; }
    }
}
Node
Sniffer这个工具类是嗅探器用于提供根据文件16进制头判断类型等功能,很多核心算法都是在这个类中,后面我详细介绍这个算法
// <copyright file="Sniffer.cs" company="Rocket Robin">
// Copyright (c) Rocket Robin. All rights reserved.
// Licensed under the Apache v2 license. See LICENSE file in the project root for full license information.
// </copyright>
using System;
using System.Collections.Generic;
using System.Linq;

namespace Myrmec
{
    /// <summary>
    /// Matches file heads against registered signatures. Simple signatures are stored in a
    /// byte-keyed tree rooted at a private node; records flagged as complex are kept in
    /// <see cref="ComplexMetadata"/> and matched separately.
    /// </summary>
    public class Sniffer
    {
        /// <summary>
        /// You can get the file extention name detail in this wikipedia page.
        /// </summary>
        public const string FileExtentionHelpUrl = "https://en.wikipedia.org/wiki/List_of_file_signatures";

        private Node _root;

        /// <summary>
        /// Initializes a new instance of the <see cref="Sniffer"/> class with an empty tree.
        /// </summary>
        public Sniffer()
        {
            _root = new Node()
            {
                Children = new SortedList<byte, Node>(),

                // The root holds no byte of its own; real nodes start at depth 0.
                Depth = -1,
            };
            ComplexMetadata = new List<Metadata>();
        }

        /// <summary>
        /// Gets or sets the complex metadata list.
        /// </summary>
        public List<Metadata> ComplexMetadata { get; set; }

        /// <summary>
        /// Adds a signature to the metadata tree.
        /// </summary>
        /// <param name="data">File head bytes; must contain at least one byte.</param>
        /// <param name="extentions">File extension list stored on the node of the last byte.</param>
        public void Add(byte[] data, string[] extentions)
        {
            Add(data, _root, extentions, 0);
        }

        /// <summary>
        /// Adds a record: complex records go to <see cref="ComplexMetadata"/>, all others
        /// are converted to bytes and inserted into the tree.
        /// </summary>
        /// <param name="record">The record to register.</param>
        public void Add(Record record)
        {
            if (record.IsComplexMetadata)
            {
                ComplexMetadata.Add(record);
            }
            else
            {
                Add(record.Hex.GetByte(), record.Extentions.Split(',', ' '));
            }
        }

        /// <summary>
        /// Finds the extensions that match the file hex head.
        /// </summary>
        /// <param name="data">File hex head.</param>
        /// <param name="matchAll">Match all results, or only the first.</param>
        /// <returns>The matched extensions; empty when nothing matches.</returns>
        public List<string> Match(byte[] data, bool matchAll = false)
        {
            List<string> extentionStore = new List<string>();
            Match(data, 0, _root, extentionStore, matchAll);

            // Complex metadata is consulted when all matches are wanted or the tree found nothing.
            if (matchAll || !extentionStore.Any())
            {
                extentionStore.AddRange(ComplexMetadata.Match(data, matchAll));
            }

            // Remove repeated extensions.
            if (matchAll && extentionStore.Any())
            {
                extentionStore = extentionStore.Distinct().ToList();
            }

            return extentionStore;
        }

        /// <summary>
        /// Recursively inserts <paramref name="data"/> below <paramref name="parent"/>,
        /// one byte per tree level, and attaches the extensions to the node of the last byte.
        /// </summary>
        private void Add(byte[] data, Node parent, string[] extentions, int depth)
        {
            if (parent.Children == null)
            {
                // Initial capacity hint only; it shrinks with depth and does not affect matching.
                // NOTE(review): the constants 128 and 2 were missing from this listing and are
                // restored from the upstream source; confirm.
                parent.Children = new SortedList<byte, Node>(Convert.ToInt32(128 / Math.Pow(2, depth)));
            }

            // Reuse the node for this byte if it exists, otherwise create and link it.
            if (!parent.Children.TryGetValue(data[depth], out Node current))
            {
                current = new Node
                {
                    Depth = depth,
                    Parent = parent
                };
                parent.Children.Add(data[depth], current);
            }

            // Last byte: store the extensions on this node and stop.
            if (depth == (data.Length - 1))
            {
                if (current.Extentions == null)
                {
                    current.Extentions = new List<string>();
                }

                current.Extentions.AddRange(extentions);
                return;
            }

            Add(data, current, extentions, depth + 1);
        }

        /// <summary>
        /// Recursively walks the tree along <paramref name="data"/>, collecting the extensions
        /// of every node on the path that carries any (or only the first when
        /// <paramref name="matchAll"/> is false).
        /// </summary>
        private void Match(byte[] data, int depth, Node node, List<string> extentionStore, bool matchAll)
        {
            // Depth has reached data.Length: the input is exhausted.
            if (data.Length == depth)
            {
                return;
            }

            node.Children.TryGetValue(data[depth], out Node current);

            // No node for this byte: matching ends.
            if (current == null)
            {
                return;
            }

            // Extensions present: this node terminates a registered signature.
            if (current.Extentions != null)
            {
                extentionStore.AddRange(current.Extentions);

                // Only the first match was requested.
                if (!matchAll)
                {
                    return;
                }
            }

            // No children: matching ends.
            if (current.Children == null)
            {
                return;
            }

            // Children exist: keep matching with the next byte.
            Match(data, depth + 1, current, extentionStore, matchAll);
        }
    }
}
Sniffer
好了,上面就是几个核心的类,现在讲解一下核心源码。以类似下面这样的单元测试代码为例(其中的 [TestMethod] 是 MSTest 的特性,并非 NUnit):
/// <summary>
/// Populates a sniffer with both predefined tables and checks that the head
/// FF D8 FF DB is reported as both "jpg" and "jpeg".
/// </summary>
[TestMethod]
public void SnifferTest()
{
    var sniffer = new Sniffer();
    sniffer.Populate(FileTypes.Common);
    sniffer.Populate(FileTypes.Unfrequent);

    byte[] head = new byte[] { 0xff, 0xd8, 0xff, 0xdb };

    // matchAll = true: collect every matching extension, not just the first.
    List<string> result = sniffer.Match(head, true);

    Assert.IsTrue(result.Contains("jpg"));
    Assert.IsTrue(result.Contains("jpeg"));
}
Demo
var sniffer = new Sniffer();这个是定义和初始化嗅探器,在这个过程中会使用Node类初始化树,这个树用于存储所有FileTypes下定义的节点。
/// <summary>
/// Initializes a new instance of the <see cref="Sniffer"/> class with an empty tree.
/// </summary>
public Sniffer()
{
    _root = new Node()
    {
        Children = new SortedList<byte, Node>(),

        // The root holds no byte of its own; real nodes start at depth 0.
        Depth = -1,
    };
    ComplexMetadata = new List<Metadata>();
}
初始化
sniffer.Populate(FileTypes.Common);这个作用是生成树。生成过程是一个递归算法
/// <summary>
/// Registers every record of <paramref name="records"/> with the sniffer, in list order.
/// </summary>
/// <param name="sniffer">The sniffer that receives the records.</param>
/// <param name="records">The records to add.</param>
public static void Populate(this Sniffer sniffer, IList<Record> records)
{
    foreach (Record entry in records)
    {
        sniffer.Add(entry);
    }
}
/// <summary>
/// Adds a record: complex records go to <see cref="ComplexMetadata"/>, all others
/// are converted to bytes and inserted into the tree.
/// </summary>
/// <param name="record">The record to register.</param>
public void Add(Record record)
{
    if (!record.IsComplexMetadata)
    {
        // Simple record: hex string becomes the byte path, extensions are split on ',' and ' '.
        byte[] head = record.Hex.GetByte();
        string[] names = record.Extentions.Split(',', ' ');
        Add(head, names);
        return;
    }

    ComplexMetadata.Add(record);
}
/// <summary>
/// Adds a signature to the metadata tree.
/// </summary>
/// <param name="data">File head bytes; must contain at least one byte.</param>
/// <param name="extentions">File extension list stored on the node of the last byte.</param>
public void Add(byte[] data, string[] extentions)
{
    // Insertion starts at the root with the first byte (depth 0).
    Add(data, _root, extentions, 0);
}
/// <summary>
/// Recursively inserts <paramref name="data"/> below <paramref name="parent"/>,
/// one byte per tree level, and attaches the extensions to the node of the last byte.
/// </summary>
/// <param name="data">Signature bytes being inserted.</param>
/// <param name="parent">Node under which the byte at <paramref name="depth"/> is placed.</param>
/// <param name="extentions">Extensions stored on the final node.</param>
/// <param name="depth">Index into <paramref name="data"/> handled by this call.</param>
private void Add(byte[] data, Node parent, string[] extentions, int depth)
{
    if (parent.Children == null)
    {
        // Initial capacity hint only; it shrinks with depth and does not affect matching.
        // NOTE(review): the constants 128 and 2 were missing from this listing and are
        // restored from the upstream source; confirm.
        parent.Children = new SortedList<byte, Node>(Convert.ToInt32(128 / Math.Pow(2, depth)));
    }

    // Reuse the node for this byte if it exists, otherwise create and link it.
    if (!parent.Children.TryGetValue(data[depth], out Node current))
    {
        current = new Node
        {
            Depth = depth,
            Parent = parent
        };
        parent.Children.Add(data[depth], current);
    }

    // Last byte: store the extensions on this node and stop.
    if (depth == (data.Length - 1))
    {
        if (current.Extentions == null)
        {
            current.Extentions = new List<string>();
        }

        current.Extentions.AddRange(extentions);
        return;
    }

    Add(data, current, extentions, depth + 1);
}
生成树
生成树的过程是:首先把FileTypes.Common中预先定义的Record列表里每条记录的hex按空格和逗号分隔,然后把每一段16进制转成对应的字节值(10进制表示)。例如new Record("ogg oga ogv", "4F 67 67 53"),hex部分是 "4F 67 67 53",转换为10进制就是“79 103 103 83”,然后按顺序把这串数字逐层插入树中:79是根节点下的子节点,103是79的子节点,以此类推,最后一个节点83上记录扩展名ogg、oga、ogv(原文此处为示意图):
如果再加上一个Record节点,如果有相同顺序部分,就使用相同的节点,例如再有一个路径是new Record("XXX", "4F 67 01"),此时树节点如下:
明白了上面树结构,就很容易知道这树的作用了,就是为了提高查询的速度的。var result = sniffer.Match(head,true);就可以通过树的路径找到文件类型,即叶子节点对应的类型属性。
好了,这个工具其实原理很简单,就是使用的树结构提高了查询效率而已。
- 嗅探器
研究开源源码之Myrmec的更多相关文章
- ASP.NET Aries 2.0 发布(原来的源码SVN已关闭,开源源码已迁移到GitHub)
主要更新: 1:增加子目录部署支持. 2:增加Taurus.MVC支持. 3:优化及Bug修复. 1:增加子目录部署支持: 其实在重写Aries框架的时候,我是去掉了目录部署功能的,主要是为了加快Ar ...
- PDF.NET 开发框架之 SOD框架 Ver 5.2 正式版开源源码发布
PDF.NET 开发框架之 SOD框架 Ver 5.2.1.0307 正式版发布,包含以下部分: SOD_Pwmis.Core --包括下列数据提供程序 SqlServer SqlServerCe A ...
- 关于在线查看相关开源源码的网站,包括Android源码
无废话,纯干货! 各种源码自行搜索: http://grepcode.com/ Android源码:http://grepcode.com/project/repository.grepcode.co ...
- Android开源源码推荐(一)
1.Android-ViewPagerIndicator http://www.akaifa.com/code/86/android-viewpagerindicator 实现各种样式的滑动视图(Sc ...
- SpringMVC关于json、xml自动转换的原理研究[附带源码分析 --转
SpringMVC关于json.xml自动转换的原理研究[附带源码分析] 原文地址:http://www.cnblogs.com/fangjian0423/p/springMVC-xml-json-c ...
- 研究VCL源码的原因和起点
---恢复内容开始--- 研究VCL源码的原因和起点 根本原因:当然是希望自己成为Delphi高手,因为这么多年过去,觉得自己始终不得要领,修改一个控件都无从下手,一直都只是个会拖控件的白痴.而我却拥 ...
- Jmeter自动化测试 POST请求和GET请求用if控制器,可以二次开发源码,将请求方式通过数据源传入,就不需要做多余的判断
Jmeter自动化测试 POST请求和GET请求用if控制器,可以二次开发源码,将请求方式通过数据源传入,就不需要做多余的判断 目前常用的做法:
- SpringMVC关于json、xml自动转换的原理研究[附带源码分析]
目录 前言 现象 源码分析 实例讲解 关于配置 总结 参考资料 前言 SpringMVC是目前主流的Web MVC框架之一. 如果有同学对它不熟悉,那么请参考它的入门blog:http://www.c ...
- 《Thinking in Android 9.0 系统开发源码钻研录》
最近打算把个人站点的博客文章同步到"博客园"! Thinking in Android -- "系统启动" [启动阶段] [相关文章] 状态 源码版本 init ...
随机推荐
- Java面试题4-附答案
BIO.NIO和AIO的区别 Java BIO : 同步并阻塞,服务器实现模式为一个连接一个线程,即客户端有连接请求时服务器端就需要启动一个线程进行处理,如果这个连接不做任何事情会造成不必要的线程 ...
- idea出现 Unable to open debugger port (127.0.0.1:xxxx): java.net.SocketException "socket closed" 解决方案
第一种:重启电脑,太费劲: 第二种: 1)根据端口号找到进程pid netstat -aon|findstr "1099" 2)杀掉进程pid即可 netstat -aon|fin ...
- MOOC(12) - 安装连接数据库的第三方库
1.连接数据库需要mysql-python驱动,可以官网下载离线安装包 安装 检查是否导入成功
- Qt platform plugin 'windows' 问题的解决方法
关于Qt 发布程序时遇到qt platform plugin ‘windows’问题的解决方法如下 遇到这个问题,一般应该已经把一部分dll拷贝到了发布的可执行文件同级目录, 我是直接添加C:\Qt ...
- 文件加密,密码加密,os模块
序列化模块 (非常非常重要) 序列化:将一个数据结构(list,dict....)转化成一个特殊的序列(特殊的字符串)的过程. # l1 = [1, 2, 3] # ret = str(l1) # p ...
- springboot学习笔记:7.IDEA下5步完成热部署配置
开发工具IDEA 2017.02 JDK1.8 1.pom.xml中增加: <dependency> <groupId>org.springframework.boot&l ...
- 使printf打印信息带有颜色
#define NONE "\033[m"#define RED "\033[0;32;31m"#define LIGHT_RED "\033[1;3 ...
- !!误解--var vm = new vue({}) 与 export default {} 是一回事儿??
这两者完全不是同一回事.export default {} 是es6的module中的语法, 而var vm = new vue({}) 是创建一个vue实例.引起误解是因为用了webpack开发vu ...
- hibernate需要注意的点
1.需要用Hibernate做实体的类(@Entity)需要在配置文件中配置对应的包(例如:spring/appContext-hibernate.xml). 2.hibernateTemplate中 ...
- Python爬虫-selenium的使用(2)
使用selenium打开chrome浏览器百度进行搜索 12345678910111213141516171819202122232425 from selenium import webdriver ...