using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
internal class Soufun_News : AnalyzerBase
{
private enum Kind
{
[Description("市场")]
Market = ,
[Description("政策")]
Policy = ,
[Description("公司")]
Company = ,
} private static readonly string[] FilterTags = new string[] { "script", "iframe" }; public override void Init(PageCrawler crawler)
{
string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>()));
crawler.PushUrl(new StringPatternGenerator(exp), );
base.Init(crawler);
} protected override void AnalyzeInternal(PageLandEntity current)
{
var lander = Crawler.Lander;
dynamic repository = Repository;
var pHandler = CreateContentHandler(current);
switch (current.Depth)
{
case :
{
var dom = lander.GetDocument(pHandler);
foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
{
var linkNode = QueryNode(node, "a.link_01");
string url = GetHref(linkNode, current.Url).OriginalString;
int i = url.LastIndexOf(".");
Crawler.PushUrl(new Uri(url.Insert(i, "_all")), );
}
}
break;
case :
{
var dom = lander.GetDocument(pHandler);
var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
string kind = QueryNodes(hackNode, "a").Last().InnerText;
string title = QueryNode(dom.DocumentNode, "h1").InnerText;
var contentNode = QueryNode(dom.DocumentNode, "#news_body");
foreach (string tag in FilterTags)
{
foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
{
node.Remove();
}
}
var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take().ToArray();
string source = null;
DateTime publishDate;
DateTime.TryParse(set[].InnerText, out publishDate);
if (set.Length == )
{
source = set[].InnerText;
}
repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
Crawler.OutWrite("保存新闻 {0}", title);
}
break;
}
}
}
}
        public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
{
Guid rowID = CryptoManaged.MD5Hash(pageUrl.OriginalString);
using (var db = Create())
{
var q = from t in db.News
where t.RowID == rowID
select t;
var news = q.SingleOrDefault();
if (news == null)
{
db.News.Add(news = new News()
{
RowID = rowID,
SiteID = pageUrl.Authority,
});
}
news.Kind = kind;
news.Source = source;
news.Title = title;
news.Content = content;
news.PublishDate = publishDate;
db._SaveChanges();
}
}

Soufun_News的更多相关文章

随机推荐

  1. url结构说明

    就以下面这个URL为例,介绍下普通URL的各部分组成 http://www.aspxfans.com:8080/news/index.asp?boardID=5&ID=24618&pa ...

  2. imail 删除历史邮件命令

    删除旧的邮件(immsgexp.exe)Immsgexp.exe 可以让管理员删除指定天数的旧的邮件.基本语法 immsgexp -t startdirectory -d #of_days_to_sa ...

  3. KTV项目总结

    KTV项目总结 大约一个星期前吧,老湿说我们要开始做KTV项目了,说是KTV项目是贯穿整个学的内容的,会所的,要我们认真去对待,一开始,第一天搭前台界面,总是有不会的,要去问问,这个要用什么控件啊,用 ...

  4. [问题2014S07] 复旦高等代数II(13级)每周一题(第七教学周)

    [问题2014S07]  设 \(A\in M_n(\mathbb{K})\) 在数域 \(\mathbb{K}\) 上的初等因子组为 \(P_1(\lambda)^{e_1},P_2(\lambda ...

  5. linux passwd文件解析

    #cat/etc/passwd root:x:::Superuser:/: daemon:x:::Systemdaemons:/etc: bin:x:::Ownerofsystemcommands:/ ...

  6. centos7上consul的集群安装

    centos7上consul的安装 ###一 下载 下载文件 wget https://releases.hashicorp.com/consul/0.6.4/consul_0.6.4_linux_a ...

  7. iOS开发数据库篇—SQLite简单介绍

    iOS开发数据库篇—SQLite简单介绍 一.离线缓存 在项目开发中,通常都需要对数据进行离线缓存的处理,如新闻数据的离线缓存等. 说明:离线缓存一般都是把数据保存到项目的沙盒中.有以下几种方式 (1 ...

  8. Nginx模块之————RTMP模块在Ubuntu上以串流直播HLS视频

    Nginx的安装在Ubuntu上以串流直播HLS视频 https://www.vultr.com/docs/setup-nginx-on-ubuntu-to-stream-live-hls-video

  9. mysql中Timestamp,time,datetime 区别

    一.TIMESTAMP[(M)] 时间戳.范围是’1970-01-01 00:00:00’到2037年. TIMESTAMP列用于INSERT或UPDATE操作时记录日期和时间. 如果你不分配一个值, ...

  10. codeforces 446A DZY Loves Sequences

    vjudge 上题目链接:codeforces 446A 大意是说最多可以修改数列中的一个数,求最长严格递增的连续子序列长度. 其实就是个 dp 的思想,想好思路后交上去没想到一直 wa 在第二个测试 ...