Dooioo Deal
using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
/// <summary>
/// Analyzer for dooioo.com pages. Depending on the depth of the landed page it
/// queues community ("houses") detail pages, saves a community record, and saves
/// the rows of the "#history-list" table as house-listing (deal) records.
/// </summary>
/// <remarks>
/// NOTE(review): several bare numeric literals appear to have been lost from this
/// copy of the file (for example <c>case :</c>, <c>spans[]</c>, <c>pageIndex = ;</c>).
/// The code does not compile as shown; restore the literals from the original
/// source before building. Each affected line is flagged below.
/// </remarks>
internal class Dooioo : AnalyzerBase
{
/// <summary>
/// Processes one landed page, dispatching on <c>current.Depth</c>.
/// </summary>
/// <param name="current">The landed page; its Depth, Url and State are read here.</param>
protected override void AnalyzeInternal(PageLandEntity current)
{
var lander = Crawler.Lander;
var pHandler = CreateContentHandler(current);
switch (current.Depth)
{
// NOTE(review): the case label is missing here (lost literal/constant) — presumably
// the entry/list depth; confirm against the original source.
case :
{
var dom = lander.GetDocument(pHandler);
// Handle paging via the last pagination link, then queue every link under
// "#hlist" (resolved against the current URL) at DataDepth.Houses.
DoPerPaging(current, dom.DocumentNode, ".pagination a:last-child"); foreach (var node in QueryNodes(dom.DocumentNode, "#hlist a"))
{
var url = GetHref(node, current.Url);
Crawler.PushUrl(url, DataDepth.Houses);
}
}
break;
case DataDepth.Houses:
{
var dom = lander.GetDocument(pHandler);
// Turn each "#building-info li" item into a "label:value" string built from
// the texts of its <span> children and collect them in the attribute filler.
var attrs = new AttributeFiller(); var Nset = QueryNodes(dom.DocumentNode, "#building-info li").Select(p =>
{
var spans = QueryTexts(p, "span").ToArray();
// NOTE(review): both index literals are missing (`spans[]`) — presumably the
// first two spans (label, value); confirm.
return string.Format("{0}:{1}", spans[], spans[]);
});
// The record key is derived from the page URL; the entity is loaded from the
// repository by that key and then populated.
attrs.Append(Nset); Guid hashKey = GenHashKey(current.Url.OriginalString);
var bo = Crawler.Repository.LoadHouses(hashKey);
bo.SiteID = current.Url.GetDomain();
bo.PageUrl = current.Url.OriginalString;
bo.CityName = Crawler.Config.CityName;
// Name mapping handed to FillEntity. Presumably page label -> entity member name
// (the entity exposes a member called 小区名称, used below); verify the direction
// against AttributeFiller.FillEntity.
attrs.FillEntity(bo, new Dictionary<string, string>()
{
{"小区名", "小区名称"},
{"板块", "所属区域"},
{"建造年代", "竣工时间"},
{"地址", "小区地址"},
{"物业类型", "物业类别"},
});
MapMark(bo);
Repository.Save(bo);
// NOTE(review): the meaning of the third QueryNodes argument (false) is not visible
// in this file — possibly "do not fail when nothing matches"; confirm.
Crawler.OutWrite("保存楼盘 {0}", bo.小区名称); var Pset = QueryNodes(dom.DocumentNode, ".pagination a", false);
if (Pset.Any())
{
// Reads the page count from the text of a pagination link near the end of the set.
// NOTE(review): the offset literal after `Pset.Count() - ` is missing; confirm which
// link holds the last page number. Pset is also enumerated more than once here.
string pageCount = Pset.Skip(Pset.Count() - ).First().InnerText;
// Queue the remaining pages as a "[2-N]" range URL at DataDepth.Deal, carrying the
// community RowID as state.
// NOTE(review): the listing id "s117862" is hard-coded in the URL — confirm this is
// intended rather than derived from the current page.
Crawler.PushUrl(new Uri(string.Format("http://www.dooioo.com/ershoufang/s117862?p=[2-{0}]", pageCount)), DataDepth.Deal, bo.RowID);
}
// Save the deal rows present on this (first) page.
SaveHouselisting(bo.RowID, current, dom);
}
break;
case DataDepth.Deal:
{
// State carries the community RowID pushed by the Houses case above.
Guid housesID = (Guid)current.State;
// Callback run against the browser document: it clicks the pagination link whose
// text equals the requested page index and flags the request as a repost.
pHandler.CrossLoad = (arg, xDom) =>
{
string pName = "p";
// Second invocation after the click: clear the flag and stop.
if (arg.IsRepost)
{
arg.IsRepost = false;
return;
}
var query = System.Web.HttpUtility.ParseQueryString(arg.RequestUrl.Query);
int pageIndex;
if (!int.TryParse(query[pName], out pageIndex))
{
// NOTE(review): the fallback literal is missing (`pageIndex = ;`) — presumably the
// first page; confirm.
pageIndex = ;
} var input = xDom.GetElementsByTagName("ul").Cast<System.Windows.Forms.HtmlElement>()
.Where(p => p.GetAttribute("class").Contains("pagination")).FirstOrDefault();
// No pagination list found: log the page body and give up on paging.
if (input == null)
{
App.LogInfo("CrossLoad xPaing:{0} {1}", this.GetType().Name, xDom.Body.InnerHtml);
return;
}
// NOTE(review): First() throws InvalidOperationException when no link text equals
// the page index; confirm that is acceptable here.
var btn = input.GetElementsByTagName("a").Cast<System.Windows.Forms.HtmlElement>()
.Where(p => p.InnerText == pageIndex.ToString()).First();
btn.InvokeMember("click");
arg.IsRepost = true;
};
var dom = lander.GetDocument(pHandler);
SaveHouselisting(housesID, current, dom);
}
break;
}
// SaveHouselisting: saves one HouselistingEntity per "#history-list tr" row of dom,
// tagging each with housesID. The `current` parameter is not used in the body.
} private void SaveHouselisting(Guid housesID, PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
{
// NOTE(review): a single AttributeFiller is shared by all rows; whether Append
// accumulates across iterations or FillEntity resets it is not visible here — confirm.
var attrs = new AttributeFiller();
foreach (var node in QueryNodes(dom.DocumentNode, "#history-list tr"))
{
var spans = QueryTexts(node, "td").ToArray();
// NOTE(review): every `spans[]` index literal below is missing; the column order
// of the history table must be restored from the original source.
attrs.Append("HousesID:{0}", housesID); DateTime dump;
// The transaction date is only recorded when the cell text parses as a date.
if (DateTime.TryParse(spans[], out dump))
{
attrs.Append("TransactionDate:{0}", dump);
} attrs.Append("SoldPriceOrRent:{0}", spans[]);
attrs.Append("UnitPriceOrLease:{0}", spans[]);
attrs.Append("Apartment:{0}", spans[]);
attrs.Append("ServiceBroker:{0}", spans[]);
attrs.Append("Area:{0}", spans[]); var bo = new HouselistingEntity();
attrs.FillEntity(bo);
Repository.SaveHouselisting(bo);
Crawler.OutWrite("保存小区出售记录 {0}", housesID);
}
}
}
}
Dooioo Deal的更多相关文章
- zlhome.com Deal
using AnfleCrawler.Common; using System; using System.Collections.Generic; using System.Linq; using ...
- XML节点名称中有小数点处理(deal with dot)导致使用xpath时报错解决方法
<?xml version="1.0"?> <ModifyFiles> <_Layout.cshtml>123456</_Layout.c ...
- whu 1464 deal with numbers
WHU 1464 deal with numbers 题意: 给你一串数字,对这串数字有三项操作: Minus a,b,c:对区间[a,b]中的每个数都减c. Division a,b,c:对区间[ ...
- OK335xS canutils deal with compile error
/************************************************************************************** * OK335xS ca ...
- 能让你聪明的工作DEAL四法则,来自《每周工作四小时》书籍
来自书籍<每周工作四小时>,作者蒂莫西·费里斯(Tim Ferriss,昵称:蒂姆) 能让你聪明的工作DEAL四法则: 第一步:D——定位(Definition) 第二步:E——精简( ...
- how to deal with EINTR fault
[how to deal with EINTR fault] EINTR:interrupted error.是指一个调用被信号给中断,对于同步的耗时调用来说,这个操作常见,譬如select.read. ...
- Spoken English Practice( Believe it or not, I don't need to make believe its a big deal. (believe,deal, You don't say))
音标复习 绿色:连读:红色:略读:蓝色:浊化:橙色:弱读 口语蜕变(2017/6/25) Sorry, t ...
- If you want the rainbow, you have to deal with the rain.
If you want the rainbow, you have to deal with the rain.想要彩虹,就先忍受雨水.
- Using SMOTEBoost(过采样) and RUSBoost(使用聚类+集成学习) to deal with class imbalance
Using SMOTEBoost and RUSBoost to deal with class imbalance from:https://aitopics.org/doc/news:1B9F7A ...
随机推荐
- Centos6.7安装docker1.7.1
Docker当前发布的最新版本已经到了1.11,其官网上针对Centos的安装需求如下: Docker requires a 64-bit installation regardless of your ...
- jQuery 简单过滤选择器
<!DOCTYPE HTML> <html> <head> <title> 使用jQuery基本过滤选择器 </title> <scr ...
- Tomcat配置文件之servlet.xml中选项介绍
Servlet.xml 分为以下元素: server, service, Connector ( 表示客户端和service之间的连接), Engine ( 表示指定service 中的请求处理机,接 ...
- Script循环语句 的相关知识跟练习
循环语句有两种问题类型:穷举和迭代 穷举: 在不知道什么情况下才是我们需要的结果的时候,只能让它一个一个的都执行一遍 迭代:在现有的条件下,根据规律,不断求解中间情况,最终推选出结果 两个关键词 br ...
- HTML5正确的嵌入flash
<object type="application/x-shockwave-flash" data="your-flash-file.swf" width ...
- 2016年12月24日 星期六 --出埃及记 Exodus 21:19
2016年12月24日 星期六 --出埃及记 Exodus 21:19 the one who struck the blow will not be held responsible if the ...
- Underscore.js 初探
一. 简介 Underscore 这个单词的意思是“下划线”. Underscore.js 是一个 JavaScript 工具库,提供了一整套的辅助方法供你使用. Think that - ...
- Spring并发访问的线程安全性问题
Spring并发访问的线程安全性问题 http://windows9834.blog.163.com/blog/static/27345004201391045539953/ 由于Spring MVC ...
- [渣译文] 使用 MVC 5 的 EF6 Code First 入门 系列:为ASP.NET MVC应用程序使用异步及存储过程
这是微软官方教程Getting Started with Entity Framework 6 Code First using MVC 5 系列的翻译,这里是第九篇:为ASP.NET MVC应用程序 ...
- 12.NFS搭建配置
参考博客:http://www.cnblogs.com/mchina/archive/2013/01/03/2840040.html 1.关闭防火墙和SELINUX $ service iptable ...