using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading; namespace GearUp.Crawler
{
/// <summary>
/// Crawls pages of a single domain through a TPL Dataflow pipeline:
/// a downloader block feeds a broadcast block, which hands every page to a
/// link extractor (whose output is linked back into the downloader) and to
/// a writer block that parses the page and saves the resulting item.
/// </summary>
public class Crawler
{
    /// <summary>Seconds to wait for a single page before the download is abandoned.</summary>
    private const int DownloadTimeout = 10;

    /// <summary>
    /// URLs that have already been claimed for download. The field is static,
    /// so it is shared by every <see cref="Crawler"/> instance in the process.
    /// </summary>
    private static readonly ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>();

    /// <summary>
    /// Guards the colour/write/reset sequence in <see cref="WriteToConsole"/>,
    /// which is invoked from blocks that run with a parallelism of up to four.
    /// </summary>
    private static readonly object consoleLock = new object();

    private readonly ILoreBookItemRepository repository;
    private readonly ILorebookItemParser parser;
    private readonly LinkManager linkManager;

    /// <summary>Domain of the start URL; only links whose domain equals it are followed.</summary>
    private string linkDomain;

    /// <summary>Creates a crawler that uses the given collaborators.</summary>
    /// <param name="repository">Receives every parsed item via <c>Save</c>.</param>
    /// <param name="parser">Turns the node returned by the document into an item.</param>
    /// <param name="linkManager">Used to fully qualify links found on a page.</param>
    public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
    {
        this.repository = repository;
        this.parser = parser;
        this.linkManager = linkManager;
    }

    /// <summary>
    /// Fire-and-forget entry point kept for existing callers. It runs
    /// <see cref="StartCrawlAsync"/> and reports any failure on the console,
    /// because an exception escaping an <c>async void</c> method cannot be
    /// observed by the caller.
    /// </summary>
    /// <param name="targetUrl">URL the crawl starts from.</param>
    public async void StartCrawl(string targetUrl)
    {
        try
        {
            await StartCrawlAsync(targetUrl);
        }
        catch (Exception ex)
        {
            WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
        }
    }

    /// <summary>
    /// Builds the pipeline, posts <paramref name="targetUrl"/> to it, blocks the
    /// calling thread until the user presses Escape, then cancels the pipeline
    /// and waits for every block to finish.
    /// </summary>
    /// <param name="targetUrl">URL the crawl starts from; its domain limits which links are followed.</param>
    /// <returns>A task that completes once all four blocks have completed.</returns>
    public async Task StartCrawlAsync(string targetUrl)
    {
        linkDomain = LinkManager.LinkDomain(targetUrl);

        using (var cts = new CancellationTokenSource())
        {
            var ct = cts.Token;

            // The downloader observes the token as well; without it the block would
            // never complete and the final await below could not finish.
            var downloaderOptions = new ExecutionDataflowBlockOptions
            {
                MaxMessagesPerTask = 3,
                MaxDegreeOfParallelism = 4,
                BoundedCapacity = 10,
                CancellationToken = ct
            };
            var downloader = new TransformBlock<string, PageAndUrl>(async (url) => await DownloadUrl(url), downloaderOptions);

            var pipelineOptions = new ExecutionDataflowBlockOptions
            {
                MaxMessagesPerTask = 2,
                CancellationToken = ct
            };
            var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions);

            var writer = new ActionBlock<PageAndUrl>(
                async page => await SaveEntry(page),
                new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

            var contentBroadcaster = new BroadcastBlock<PageAndUrl>(
                p => p,
                new ExecutionDataflowBlockOptions() { CancellationToken = ct });

            // Flow setup
            downloader.LinkTo(contentBroadcaster);
            contentBroadcaster.LinkTo(linkParser);
            contentBroadcaster.LinkTo(writer);
            linkParser.LinkTo(downloader);

            //Kick off the TPL dataflow here
            downloader.Post(targetUrl);
            WriteToConsole("Crawling...", ConsoleColor.Green);
            PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);

            cts.Cancel();
            // The writer is deliberately not tied to the token: completing it lets
            // pages it has already accepted be saved before it shuts down.
            writer.Complete();
            WriteToConsole("Stopping...", ConsoleColor.Green);

            try
            {
                await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion);
            }
            catch (OperationCanceledException)
            {
                // Expected: the blocks bound to the token end in the Canceled state.
            }
        }
    }

    /// <summary>
    /// Collects the links of a page that stay on the crawl's domain.
    /// </summary>
    /// <param name="page">Downloaded page, or <c>null</c> when the download produced nothing.</param>
    /// <returns>
    /// Fully qualified URLs whose domain equals the start URL's domain;
    /// an empty sequence when <paramref name="page"/> is <c>null</c>.
    /// </returns>
    public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
    {
        if (page == null)
        {
            return Enumerable.Empty<string>();
        }

        var discoveredLinks = new List<string>();
        var document = new LorebookDocument(page.Html);
        foreach (var link in document.LinksInArticleBodyDiv())
        {
            var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
            if (linkDomain.Equals(LinkManager.LinkDomain(fullUrl)))
            {
                discoveredLinks.Add(fullUrl);
            }
        }

        WriteToConsole(" {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
        return discoveredLinks;
    }

    /// <summary>
    /// Hands the node returned by <c>document.OfficialLorebookEntry()</c> to the parser.
    /// </summary>
    /// <param name="document">Document wrapping the page's HTML.</param>
    /// <param name="url">URL the document was downloaded from; logged and passed to the parser.</param>
    /// <returns>Whatever the parser returns for the node.</returns>
    public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
    {
        WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
        var itemDetails = document.OfficialLorebookEntry();
        return parser.ParseHtmlNode(itemDetails, url);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as a string unless it was already claimed
    /// by an earlier call, giving up after <see cref="DownloadTimeout"/> seconds.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>
    /// The page and its URL, or <c>null</c> when the URL was seen before, the
    /// download timed out, or an error occurred (errors are written to the console).
    /// </returns>
    public async Task<PageAndUrl> DownloadUrl(string url)
    {
        try
        {
            // TryAdd both tests and claims the URL in one atomic step, so two
            // parallel downloader tasks can never fetch the same address.
            if (!urls.TryAdd(url, true))
            {
                return null;
            }

            using (var client = new WebClient())
            {
                WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
                var download = client.DownloadStringTaskAsync(url);
                var timeout = Task.Delay(TimeSpan.FromSeconds(DownloadTimeout));

                var finished = await Task.WhenAny(download, timeout);
                if (finished == timeout)
                {
                    client.CancelAsync();
                    ObserveFault(download);
                    WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
                    return null;
                }

                // Awaiting (rather than reading .Result) rethrows the original
                // exception, so a failed request reaches the WebException handler.
                string html = await download;
                WriteToConsole("Downloaded: {0}", ConsoleColor.White, url);
                return new PageAndUrl() { Url = url, Html = html };
            }
        }
        catch (WebException ex)
        {
            WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
        }
        catch (Exception ex)
        {
            WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
        }

        return null;
    }

    /// <summary>
    /// Parses the page and, when the parser yields an item, saves it under the page's URL.
    /// </summary>
    /// <param name="page">Downloaded page; <c>null</c> is ignored.</param>
    public async Task SaveEntry(PageAndUrl page)
    {
        if (page == null)
        {
            return;
        }

        var document = new LorebookDocument(page.Html);
        var item = ExtractLoreBookItem(document, page.Url);
        if (item != null)
        {
            await repository.Save(page.Url, item);
        }
    }

    /// <summary>
    /// Reads the exception of a task nobody will await any more, so that a
    /// later fault does not surface as an unobserved task exception.
    /// </summary>
    private static void ObserveFault(Task task)
    {
        task.ContinueWith(
            t => { var ignored = t.Exception; },
            CancellationToken.None,
            TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously,
            TaskScheduler.Default);
    }

    /// <summary>
    /// Writes one formatted line in the given colour and then resets the console colours.
    /// </summary>
    /// <param name="format">Composite format string passed to <c>Console.WriteLine</c>.</param>
    /// <param name="color">Foreground colour used for this line.</param>
    /// <param name="texts">Arguments for the placeholders in <paramref name="format"/>.</param>
    private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
    {
        // Serialised so that concurrent callers cannot interleave their
        // colour changes with each other's output.
        lock (consoleLock)
        {
            Console.ForegroundColor = color;
            Console.WriteLine(format, texts);
            Console.ResetColor();
        }
    }

    /// <summary>
    /// Shows <paramref name="message"/> and blocks until the user responds.
    /// </summary>
    /// <param name="message">Text to display.</param>
    /// <param name="color">Colour of the message.</param>
    /// <param name="key">
    /// Key to wait for; when <c>null</c> the method waits for a whole line instead.
    /// </param>
    private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
    {
        WriteToConsole(message, color);
        if (!key.HasValue)
        {
            Console.ReadLine();
            return;
        }

        ConsoleKeyInfo entry;
        do
        {
            // intercept: true keeps the pressed key from being echoed.
            entry = Console.ReadKey(true);
        }
        while (entry.Key != key.Value);
    }
}
}

tdf sample的更多相关文章

  1. Linux下UPnP sample分析

        一.UPnP简介   UPnP(Universal Plug and Play)技术是一种屏蔽各种数字设备的硬件和操作系统的通信协议.它是一种数字网络中间件技术,建立在TCP/IP.HTTP协 ...

  2. cocos2d-x for android配置 & 运行 Sample on Linux OS

    1.从http://www.cocos2d-x.org/download下载稳定版 比如cocos2d-x-2.2 2.解压cocos2d-x-2.2.zip,比如本文将其解压到 /opt 目录下 3 ...

  3. android studio2.2 的Find Sample Code点击没有反应

    1 . 出现的问题描述:           右键点击Find Sample Code后半天没有反应,然后提示 Samples are currently unavailable for :{**** ...

  4. jmeter(四)Sample之http请求

    启动jmeter,建立一个测试计划 这里再次说说怎么安装和启动jmeter吧,昨天下午又被人问到怎样安装和使用,我也是醉了:在我看来,百度能解决百分之八十的问题,特别是基础的问题... 安装:去官网下 ...

  5. jcaptcha sample 制作验证码

    Skip to end of metadata Created by marc antoine garrigue, last modified by Jeremy Waters on Feb 23, ...

  6. Python 对不均衡数据进行Over sample(重抽样)

    需要重采样的数据文件(Libsvm format),如heart_scale +1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.4 ...

  7. Basic linux command-with detailed sample

    Here I will list some parameters which people use very often, I will attach the output of the command ...

  8. 例子:RSS Reader Sample

    本例演示了Rss xml信息的获取,以及如何使用SyndicationFeed来进行符合Rss规范的xml进行解析. SyndicationFeed 解析完成后 可以得到SyndicationItem ...

  9. 例子:Background Audio Streamer Sample

    The Background Audio Streamer sample demonstrates how to create an app that uses a MediaStreamSource ...

随机推荐

  1. 页面加载时的div动画

    用@keyframes(动画),实现页面加载时的div动画(不要用js控制,因为当页面加载的时候,js还不一定可以使用) 可以在https://daneden.github.io/animate.cs ...

  2. docker 部署disconf 以及将其做成镜像

    1.需要一台服务器(阿里云,腾讯云.实体服务器都行,本次是以实体服务器为依照做的) 2.安装docker   https://www.cnblogs.com/shijunjie/p/10436293. ...

  3. vm12下Centos6安装mysql5.7

    一.下载mysql的rpm tar文件 文件名称:mysql-5.7.18-1.el6.x86_64.rpm-bundle.tar官方地址:https://dev.mysql.com/get/Down ...

  4. Vue省市区三级联选择器V-Distpicker的使用

    Vue省市区三级联选择器V-Distpicker的使用 最近用的Vue+Element UI时,有些地方需要用到省市区三联选择器,网上安装并尝试了多种类似的插件,但都因为无法正常实现或是没有眼缘而弃用 ...

  5. maven入门与常用插件使用

    maven不仅仅是一款管理jar包的工具,还可以

  6. Blockly

    Blockly简介 A library for building visual programming editors.  Blockly 是个库,可用来构建可视化编程编辑器 Blockly is b ...

  7. temp脚本

    #!/bin/bash source ${HOME_DIR}/script/ideploy_dm.inc source ${HOME_DIR}/script/comm_lib home_dir=$(cd ...

  8. tomcat局域网内发布html

    1. 保证tomcat装好,启动 验证:浏览器输入:localhost:8080,看到下面页面表示成功 2. 把html文件或包含html的目录拷到Apach安装目录的Root目录下 (例如:C:\P ...

  9. JavaScript 监听回车事件

    JS监听某个输入框 //回车事件绑定 $('#search_input').bind('keyup', function(event) { if (event.keyCode == "13& ...

  10. 2、按钮:Buttons

    /* --- page1.html ---*/ <ion-content padding class="page1"> <h1>基本用法,实体框</h ...