using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading; namespace GearUp.Crawler
{
/// <summary>
/// Crawls a site starting from a single URL using a TPL Dataflow pipeline:
/// downloader -> broadcaster -> (link parser -> back to downloader, writer).
/// Only links whose domain matches the start URL's domain are followed.
/// </summary>
public class Crawler
{
    // URLs that have already been claimed for download. Static, so the set is
    // shared by every Crawler instance in the process.
    private static readonly ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>();

    // Serialises colour changes + writes so parallel blocks cannot interleave
    // each other's console colours.
    private static readonly object consoleLock = new object();

    // Per-download timeout, in seconds.
    private const int DownloadTimeout = 10;

    private readonly ILoreBookItemRepository repository;
    private readonly ILorebookItemParser parser;
    private readonly LinkManager linkManager;

    // Domain of the URL the current crawl was started from; set by StartCrawlAsync.
    private string linkDomain;

    /// <summary>Creates a crawler that parses pages with <paramref name="parser"/> and stores results in <paramref name="repository"/>.</summary>
    /// <param name="repository">Destination for parsed items.</param>
    /// <param name="parser">Converts the lorebook HTML node of a page into an item.</param>
    /// <param name="linkManager">Used to turn discovered links into fully qualified URLs.</param>
    public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
    {
        this.repository = repository;
        this.parser = parser;
        this.linkManager = linkManager;
    }

    /// <summary>
    /// Fire-and-forget entry point kept for existing callers. Runs
    /// <see cref="StartCrawlAsync"/> and reports any failure on the console,
    /// because an exception escaping an <c>async void</c> method cannot be
    /// observed by the caller.
    /// </summary>
    /// <param name="targetUrl">URL the crawl starts from.</param>
    public async void StartCrawl(string targetUrl)
    {
        try
        {
            await StartCrawlAsync(targetUrl);
        }
        catch (Exception ex)
        {
            WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
        }
    }

    /// <summary>
    /// Builds the dataflow pipeline, seeds it with <paramref name="targetUrl"/>,
    /// blocks the calling thread until the user presses Esc, then cancels the
    /// pipeline and waits for it to shut down.
    /// </summary>
    /// <param name="targetUrl">URL the crawl starts from; its domain limits which links are followed.</param>
    /// <returns>A task that completes once every block in the pipeline has stopped.</returns>
    public async Task StartCrawlAsync(string targetUrl)
    {
        using (var cts = new CancellationTokenSource())
        {
            var ct = cts.Token;
            linkDomain = LinkManager.LinkDomain(targetUrl);

            var downloaderOptions = new ExecutionDataflowBlockOptions
            {
                MaxMessagesPerTask = 3,
                MaxDegreeOfParallelism = 4,
                BoundedCapacity = 10,
                // Without a token the downloader could never complete after the
                // broadcaster is cancelled: its buffered output would have no
                // consumer left, and awaiting its Completion would hang.
                CancellationToken = ct
            };
            var downloader = new TransformBlock<string, PageAndUrl>(async url => await DownloadUrl(url), downloaderOptions);

            var pipelineOptions = new ExecutionDataflowBlockOptions
            {
                MaxMessagesPerTask = 2,
                CancellationToken = ct
            };
            var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions);

            var writer = new ActionBlock<PageAndUrl>(
                async page => await SaveEntry(page),
                new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

            var contentBroadcaster = new BroadcastBlock<PageAndUrl>(
                p => p,
                new DataflowBlockOptions { CancellationToken = ct });

            // Flow setup: every downloaded page goes to both the link parser and
            // the writer; discovered links loop back into the downloader.
            downloader.LinkTo(contentBroadcaster);
            contentBroadcaster.LinkTo(linkParser);
            contentBroadcaster.LinkTo(writer);
            linkParser.LinkTo(downloader);

            // Kick off the TPL dataflow here
            downloader.Post(targetUrl);
            WriteToConsole("Crawling...", ConsoleColor.Green);
            PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);

            cts.Cancel();
            WriteToConsole("Stopping...", ConsoleColor.Green);

            // The writer has no token on purpose: pages it already received are
            // still saved. Complete() lets it drain and then finish.
            writer.Complete();

            try
            {
                await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion);
            }
            catch (OperationCanceledException)
            {
                // Expected: the cancelled blocks finish in the Canceled state.
            }
        }
    }

    /// <summary>
    /// Collects the links found in the article body of <paramref name="page"/>
    /// that belong to the same domain as the crawl's start URL.
    /// </summary>
    /// <param name="page">Downloaded page, or <c>null</c> when the download was skipped or failed.</param>
    /// <returns>Fully qualified same-domain URLs; empty when <paramref name="page"/> is <c>null</c>.</returns>
    public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
    {
        if (page == null)
        {
            return Enumerable.Empty<string>();
        }

        var discoveredLinks = new List<string>();
        var document = new LorebookDocument(page.Html);
        foreach (var link in document.LinksInArticleBodyDiv())
        {
            var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
            string domain = LinkManager.LinkDomain(fullUrl);

            // Static Equals keeps the original ordinal comparison but does not
            // throw when linkDomain has not been set yet.
            if (string.Equals(linkDomain, domain, StringComparison.Ordinal))
            {
                discoveredLinks.Add(fullUrl);
            }
        }

        WriteToConsole(" {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
        return discoveredLinks;
    }

    /// <summary>Parses the official lorebook entry node of <paramref name="document"/> into an item.</summary>
    /// <param name="document">Document wrapping the page HTML.</param>
    /// <param name="url">URL the document was downloaded from; passed on to the parser.</param>
    /// <returns>Whatever the parser returns for the entry node.</returns>
    public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
    {
        WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
        var itemDetails = document.OfficialLorebookEntry();
        return parser.ParseHtmlNode(itemDetails, url);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> once. Returns <c>null</c> when the URL
    /// was already claimed, when the download exceeds <see cref="DownloadTimeout"/>
    /// seconds, or when an error occurs (errors are written to the console).
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <returns>The page HTML together with its URL, or <c>null</c>.</returns>
    public async Task<PageAndUrl> DownloadUrl(string url)
    {
        try
        {
            // TryAdd is atomic, so with several downloads running in parallel
            // only the first caller for a given URL gets past this line.
            if (!urls.TryAdd(url, true))
            {
                return null;
            }

            using (var client = new WebClient())
            {
                WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
                var download = client.DownloadStringTaskAsync(url);
                var timeout = Task.Delay(TimeSpan.FromSeconds(DownloadTimeout));

                var finishedFirst = await Task.WhenAny(download, timeout);
                if (finishedFirst == timeout)
                {
                    client.CancelAsync();
                    WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
                    return null;
                }

                // The task has already finished; await (rather than .Result)
                // rethrows the original exception instead of an AggregateException.
                string result = await download;
                WriteToConsole("Downloaded: {0}", ConsoleColor.White, url);
                return new PageAndUrl() { Url = url, Html = result };
            }
        }
        catch (WebException ex)
        {
            WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
        }
        catch (AggregateException ex)
        {
            foreach (var exc in ex.Flatten().InnerExceptions)
            {
                WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, exc.Message);
            }
        }
        catch (Exception ex)
        {
            WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
        }

        return null;
    }

    /// <summary>Parses <paramref name="page"/> and saves the resulting item, if any, under the page URL.</summary>
    /// <param name="page">Downloaded page; <c>null</c> is ignored.</param>
    public async Task SaveEntry(PageAndUrl page)
    {
        if (page == null)
        {
            return;
        }

        var document = new LorebookDocument(page.Html);
        var item = ExtractLoreBookItem(document, page.Url);
        if (item != null)
        {
            await repository.Save(page.Url, item);
        }
    }

    /// <summary>Writes a formatted line to the console in the given colour, then restores the default colours.</summary>
    private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
    {
        // Called from blocks running in parallel: without the lock one thread
        // could reset the colour between another thread's set and write.
        lock (consoleLock)
        {
            Console.ForegroundColor = color;
            try
            {
                Console.WriteLine(format, texts);
            }
            finally
            {
                Console.ResetColor();
            }
        }
    }

    /// <summary>
    /// Shows <paramref name="message"/> and blocks until the user answers:
    /// a full line when <paramref name="key"/> is <c>null</c>, otherwise until
    /// that specific key is pressed (key presses are not echoed).
    /// </summary>
    private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
    {
        WriteToConsole(message, color);
        if (key == null)
        {
            Console.ReadLine();
            return;
        }

        ConsoleKeyInfo entry;
        do
        {
            entry = Console.ReadKey(true);
        } while (key != entry.Key);
    }
}
}

tdf sample的更多相关文章

  1. Linux下UPnP sample分析

        一.UPnP简介   UPnP(Universal Plug and Play)技术是一种屏蔽各种数字设备的硬件和操作系统的通信协议.它是一种数字网络中间件技术,建立在TCP/IP.HTTP协 ...

  2. cocos2d-x for android配置 & 运行 Sample on Linux OS

    1.从http://www.cocos2d-x.org/download下载稳定版 比如cocos2d-x-2.2 2.解压cocos2d-x-2.2.zip,比如本文将其解压到 /opt 目录下 3 ...

  3. android studio2.2 的Find Sample Code点击没有反应

    1 . 出现的问题描述:           右键点击Find Sample Code后半天没有反应,然后提示 Samples are currently unavailable for :{**** ...

  4. jmeter(四)Sample之http请求

    启动jmeter,建立一个测试计划 这里再次说说怎么安装和启动jmeter吧,昨天下午又被人问到怎样安装和使用,我也是醉了:在我看来,百度能解决百分之八十的问题,特别是基础的问题... 安装:去官网下 ...

  5. jcaptcha sample 制作验证码

    Skip to end of metadata Created by marc antoine garrigue, last modified by Jeremy Waters on Feb 23, ...

  6. Python 对不均衡数据进行Over sample(重抽样)

    需要重采样的数据文件(Libsvm format),如heart_scale +1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.4 ...

  7. Basic linux command-with detailed sample

    Here I will list some parameters which people use very often, I will attach the output of the command ...

  8. 例子:RSS Reader Sample

    本例演示了Rss xml信息的获取,以及如何使用SyndicationFeed来进行符合Rss规范的xml进行解析. SyndicationFeed 解析完成后 可以得到SyndicationItem ...

  9. 例子:Background Audio Streamer Sample

    The Background Audio Streamer sample demonstrates how to create an app that uses a MediaStreamSource ...

随机推荐

  1. codeforces 1101G (Zero XOR Subset)-less 前缀异或+线性基

    题目传送门 题意:给出一个序列,试将其划分为尽可能多的非空子段,满足每一个元素出现且仅出现在其中一个子段中,且在这些子段中任取若干子段,它们包含的所有数的异或和不能为0. 思路:先处理出前缀异或,这样 ...

  2. 【C语言】输入一个整数N,求N以内的素数之和

    [C语言]输入一个整数N,求N以内的素数之和 /* ========================================================================== ...

  3. TimesTen LINUX 安装日志

    $ ./setup.sh NOTE: Each TimesTen installation is identified by a unique instance name. The instance ...

  4. PIXI 下落文字消除(3)

    图片示例,简陋的图,记录下落过程, 1.创建应用实例并添加到DOM元素上. (会看到一个黑色画布,没有任何元素,接下来会在画布上创建文字) 2.创建  TextStyle 用来设置要显示字体样式 3. ...

  5. Lucene初识

    1.概述 1.1 Lucene是apache软件基金会4 jakarta项目组的一个子项目: 是一个开放源代码的全文检索引擎工具包: 但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了 ...

  6. shell 函数与内置变量

    1,特殊shell变量 $# 传递到脚本的参数个数 $* 以一个单字符串显示所有向脚本传递的参数 $$ 脚本运行的当前进程ID号 $! 后台运行的最后一个进程的ID号 $@ 与$*相同,但是使用时加引 ...

  7. 引导篇之HTTP事务

    一个完整的HTTP事务流图: HTTP报文格式: 起始行:在请求报文中用来说明要做些什么,在响应报文中说明出现了什么情况 首部:起始行后面有0个或多个首部字段.每个首部字段都包含一个名字和一个值,为了 ...

  8. 【python爬虫】用python编写LOL战绩查询

    介绍一个简单的python爬虫,通过Tkinter创建一个客户端,当输入要查询的LOL用户名称的时候,可以显示出当前用户的所在服务器,当前战力和当前段位. 爬取网页地址:http://lol.duow ...

  9. Linux查找命令与find命令详解

    一.文件查找之locate命令 locate :非实时的,查找时根据全系统文件数据库进行的,模糊查找,update 手动生成文件数据库速度快 依赖于updatedb数据库 1 2 3 4 5 6 7 ...

  10. bzoj 5308: [Zjoi2018]胖

    Description Cedyks是九条可怜的好朋友(可能这场比赛公开以后就不是了),也是这题的主人公. Cedyks是一个富有的男孩子.他住在著名的ThePLace(宫殿)中. Cedyks是一个 ...