C#: using HtmlAgilityPack and ScrapySharp to read a URL and find text
https://github.com/exaphaser/ScrapySharp
https://github.com/zzzprojects/html-agility-pack
https://github.com/atifaziz/Fizzler
https://archive.codeplex.com/?p=fizzlerex
https://github.com/aspnet/blazor
https://github.com/SteveSanderson/Blazor
https://www.mathjax.org/#samples 数学公式
https://github.com/Ivony/Jumony
https://github.com/GeReV/NSoup
https://github.com/robinvanderknaap/MvcJqGrid
http://www.defenseinnovationmarketplace.mil/strategy.html
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Collections;
using ScrapySharp;
using ScrapySharp.Network;
using ScrapySharp.Core;
using HtmlAgilityPack; namespace HtmlAgilityPackDemo
{ /// <summary>
/// HTML解析利器HtmlAgilityPack
/// geovindu
/// 涂聚文
/// 20180305
/// </summary>
public partial class Form1 : Form
{
    // URL of the index page listing the cities of one area; {0} is the area (province) code.
    private const string AreaUrlFormat = "http://www.tianqihoubao.com/lishi/{0}.htm";

    // URL of one city's monthly history page; {0} = city code, {1} = year, {2} = two-digit month.
    // Example: http://www.tianqihoubao.com/lishi/dalian/month/201802.html
    private const string CityMonthUrlFormat = "http://www.tianqihoubao.com/lishi/{0}/month/{1}{2:D2}.html";

    // Culture-independent date layouts tried before falling back to DateTime.Parse.
    // NOTE(review): presumably the site prints dates like 2018年02月01日 - confirm against a live page.
    private static readonly string[] RowDateFormats = { "yyyy年MM月dd日", "yyyy年M月d日" };

    /// <summary>
    /// Creates the form and its designer-generated controls.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Pre-fills the area code text box with "ln" when the form loads.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        this.textBox1.Text = "ln";
    }

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
    public static string GetWebClient(string url)
    {
        // Dispose client, stream and reader even when the download or read throws.
        using (WebClient client = new WebClient())
        using (Stream stream = client.OpenRead(url))
        // Mind the encoding: Encoding.Default is the machine's default code page,
        // so the result depends on the host configuration.
        using (StreamReader reader = new StreamReader(stream, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads the index page of an area and extracts its cities.
    /// </summary>
    /// <param name="cityCode">Area code inserted into the index page URL.</param>
    /// <param name="listcity">Receives the cities found; empty (never null) when nothing matches.</param>
    /// <returns>One "\r\nname:code" entry per city, or an empty string when nothing matches.</returns>
    public string ParsePageByArea(String cityCode, out List<CityList> listcity)
    {
        listcity = new List<CityList>();
        StringBuilder output = new StringBuilder();

        // Build the URL from the link template and the area code, then download the page source.
        String url = String.Format(AreaUrlFormat, cityCode);

        // Fully qualified: System.Windows.Forms also declares an HtmlDocument type.
        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(GetWebClient(url));

        // Locate the container by absolute XPath; its dl children hold the city links.
        var container = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[1]/div[6]/div[1]/div[1]/div[3]");
        if (container == null)
        {
            return "";
        }

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var groups = container.SelectNodes(@"dl");
        if (groups == null)
        {
            return "";
        }

        foreach (var group in groups)
        {
            var dd = group.SelectSingleNode(@"dd");
            if (dd == null)
            {
                continue;
            }

            var links = dd.SelectNodes("a");
            if (links == null)
            {
                continue;
            }

            foreach (var link in links)
            {
                var hrefAttribute = link.Attributes["href"];
                if (hrefAttribute == null)
                {
                    continue;
                }

                // The pinyin code is the path segment just before the file extension,
                // so split the href on '/' and '.' and take the second-to-last piece.
                var parts = hrefAttribute.Value.Trim().Split('/', '.');
                if (parts.Length < 2)
                {
                    continue;
                }

                CityList city = new CityList();
                city.CityName = link.InnerText.Trim();
                city.CityCode = parts[parts.Length - 2];
                listcity.Add(city);

                output.Append("\r\n" + string.Format("{0}:{1}", city.CityName, city.CityCode));
            }
        }

        return output.ToString();
    }

    /// <summary>
    /// Downloads one city's monthly history page and extracts a record per table row.
    /// </summary>
    /// <param name="cityCode">City code inserted into the URL.</param>
    /// <param name="year">Four-digit year.</param>
    /// <param name="month">Month number; formatted with two digits in the URL.</param>
    /// <param name="wea">Receives the parsed rows; empty (never null) when nothing matches.</param>
    /// <returns>One "\r\ndate:climate,temperature,wind" entry per row, or an empty string.</returns>
    /// <exception cref="FormatException">A row's date text cannot be parsed.</exception>
    public string ParsePageByCityMonth(String cityCode, Int32 year, Int32 month, out List<WeatherList> wea)
    {
        wea = new List<WeatherList>();
        StringBuilder output = new StringBuilder();

        // Build the URL from the pinyin code and the month, then download the page source.
        String url = String.Format(CityMonthUrlFormat, cityCode, year, month);

        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(GetWebClient(url));

        // Locate the data table by absolute XPath.
        var table = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[2]/div[6]/div[1]/div[1]/table[1]");
        if (table == null)
        {
            return "";
        }

        var rows = table.SelectNodes(@"tr");
        if (rows == null)
        {
            return "";
        }

        // Start at 1: the first row is the table header.
        for (int rowIndex = 1; rowIndex < rows.Count; rowIndex++)
        {
            // Expected columns: date, weather condition, temperature, wind force and direction.
            var cells = rows[rowIndex].SelectNodes(@"td");
            if (cells == null || cells.Count != 4)
            {
                continue;
            }

            string dateText = CleanCellText(cells[0].InnerText);
            string climate = CleanCellText(cells[1].InnerText);
            string temperature = CleanCellText(cells[2].InnerText);
            string wind = CleanCellText(cells[3].InnerText);

            WeatherList entry = new WeatherList();
            entry.Date = ParseRowDate(dateText);
            entry.Climate = climate;
            entry.Temperature = temperature;
            entry.WindDirection = wind;
            wea.Add(entry);

            // Same "\r\n" row prefix as ParsePageByArea so rows do not run together.
            output.Append("\r\n" + string.Format("{0}:{1},{2},{3}", dateText, climate, temperature, wind));
        }

        return output.ToString();
    }

    /// <summary>
    /// Strips line breaks and spaces from a table cell's text.
    /// </summary>
    /// <param name="raw">Inner text of the cell.</param>
    /// <returns>The text without '\r', '\n' or ' ' characters, trimmed.</returns>
    private static string CleanCellText(string raw)
    {
        // '\r' and '\n' are removed separately so pages using bare "\n" are handled too.
        return raw.Replace("\r", "").Replace("\n", "").Replace(" ", "").Trim();
    }

    /// <summary>
    /// Parses a row's date text without depending on the current culture when possible.
    /// </summary>
    /// <param name="text">Cleaned date text.</param>
    /// <returns>The parsed date.</returns>
    /// <exception cref="FormatException">The text matches neither the known layouts nor the current culture.</exception>
    private static DateTime ParseRowDate(string text)
    {
        DateTime parsed;
        if (DateTime.TryParseExact(
                text,
                RowDateFormats,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out parsed))
        {
            return parsed;
        }

        // Fall back to culture-sensitive parsing for any other layout.
        return DateTime.Parse(text);
    }

    /// <summary>
    /// Downloads a page with ScrapingBrowser and returns its title followed by
    /// the inner HTML of the paragraphs inside the element with id "endText".
    /// Sample page: http://www.dusystem.com/geovindu.html
    /// </summary>
    /// <param name="url">Absolute address of the page.</param>
    /// <returns>Concatenated title text and paragraph HTML; empty when neither is found.</returns>
    public string getHtmlTitle(string url)
    {
        StringBuilder result = new StringBuilder();

        var browser = new ScrapingBrowser();
        var pageSource = browser.DownloadString(new Uri(url));

        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(pageSource);
        var root = doc.DocumentNode;

        // "//title" searches the whole document; a bare "title" only looks at
        // direct children of the document node and finds nothing on a normal page.
        var titles = root.SelectNodes("//title");
        if (titles != null)
        {
            foreach (var titleNode in titles)
            {
                result.Append(titleNode.InnerText);
            }
        }

        // XPath equivalent of the CSS selector "div#endText p".
        // NOTE(review): assumes the intent was the paragraphs inside div#endText - confirm.
        var paragraphs = root.SelectNodes("//div[@id='endText']//p");
        if (paragraphs != null)
        {
            foreach (var paragraph in paragraphs)
            {
                result.Append(paragraph.InnerHtml);
            }
        }

        return result.ToString();
    }

    /// <summary>
    /// Loads the cities for the area code typed in textBox1 and binds them to comboBox1.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        List<CityList> cities;
        this.richTextBox1.Text = ParsePageByArea(this.textBox1.Text.Trim(), out cities);

        this.comboBox1.DataSource = cities;
        this.comboBox1.DisplayMember = "CityName";
        this.comboBox1.ValueMember = "CityCode";
    }

    /// <summary>
    /// Loads the previous month's weather for the city selected in comboBox1
    /// and binds the rows to dataGridView1.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data (unused).</param>
    private void button2_Click(object sender, EventArgs e)
    {
        // Nothing is selected until the city list has been loaded.
        object selectedCode = this.comboBox1.SelectedValue;
        if (selectedCode == null)
        {
            return;
        }

        // AddMonths rolls January back to December of the previous year;
        // "Month - 1" would produce month 0 in January.
        DateTime previousMonth = DateTime.Now.AddMonths(-1);

        List<WeatherList> rows;
        this.richTextBox2.Text = ParsePageByCityMonth(selectedCode.ToString(), previousMonth.Year, previousMonth.Month, out rows);
        this.dataGridView1.DataSource = rows;
    }
}
/// <summary>
/// A city entry: its display name and the code used to identify it.
/// </summary>
public class CityList
{
    private string _cityName;
    private string _cityCode;

    /// <summary>
    /// Name of the city.
    /// </summary>
    public string CityName
    {
        get { return _cityName; }
        set { _cityName = value; }
    }

    /// <summary>
    /// Code of the city.
    /// </summary>
    public string CityCode
    {
        get { return _cityCode; }
        set { _cityCode = value; }
    }
}
/// <summary>
/// One weather record: climate, temperature, wind direction and the date it applies to.
/// </summary>
public class WeatherList
{
/// <summary>
/// Climate (weather condition) text.
/// </summary>
public string Climate { get; set; }
/// <summary>
/// Temperature, kept as text.
/// </summary>
public string Temperature { get; set; }
/// <summary>
/// Wind direction, kept as text.
/// </summary>
public string WindDirection { get; set; }
/// <summary>
/// Date of the record.
/// </summary>
public DateTime Date { get; set; }
} }
// This handler appears after the closing brace of the HtmlAgilityPackDemo namespace,
// so it is hosted in a second partial declaration of Form1 to keep the file valid.
namespace HtmlAgilityPackDemo
{
    public partial class Form1
    {
        /// <summary>
        /// Demo handler: downloads a fixed monthly weather page and exercises a range of
        /// HtmlAgilityPack node APIs on it, writing the results to the console.
        /// </summary>
        /// <remarks>
        /// The "author's note" comments record output the original author saw on a sample
        /// document (apparently a ul element), not necessarily on this page.
        /// NOTE(review): SelectSingleNode/SelectNodes return null when nothing matches and
        /// Single() throws unless exactly one element matches, so several statements below
        /// may throw on this page - confirm before relying on this handler.
        /// </remarks>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void button3_Click(object sender, EventArgs e)
        {
            string url = "http://www.tianqihoubao.com/lishi/dalian/month/201802.html";
            var docText = GetWebClient(url);

            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(docText);

            var divname = document.DocumentNode.Descendants("div").FirstOrDefault();
            var body = document.DocumentNode.SelectNodes("//body").Single();
            var ta = document.DocumentNode.SelectNodes("//table").Single();

            // ToArray() snapshots the matches so nodes can be removed while iterating.
            foreach (var script in document.DocumentNode.Descendants("script").ToArray())
            {
                script.Remove();
            }
            foreach (var style in document.DocumentNode.Descendants("style").ToArray())
            {
                style.Remove();
            }

            // The table rows that show the weather; "b" is the table's class name.
            List<string> paragraphs = document.DocumentNode.SelectNodes("//table[@class='b']//tr").Select(paragraphNode => paragraphNode.InnerHtml).ToList();
            string name = document.DocumentNode.SelectSingleNode("//td/input").Attributes["value"].Value;

            // Look up a node by XPath, much like XmlNode.
            // Full XPath: /html[1]/body[1]/form[1]/div[2]/div[6]/div[1]/div[1]/table[1]/tr[1]
            HtmlNode tablenode = document.DocumentNode.SelectSingleNode("//table[@class='b']//tr");
            HtmlNode node = document.DocumentNode.SelectSingleNode("//*");

            // All ancestors of the node.
            IEnumerable<HtmlNode> nodeList = node.Ancestors();
            foreach (HtmlNode item in nodeList)
            {
                Console.Write(item.Name + " "); // author's note: div div body html #document
            }

            HtmlAttributeCollection attrs = node.Attributes;
            foreach (var item in attrs)
            {
                Console.WriteLine(item.Name + " : " + item.Value); // author's note: class :user_match clear
            }

            // All child nodes; text nodes count as children too.
            HtmlNodeCollection CNodes = node.ChildNodes;
            foreach (HtmlNode item in CNodes)
            {
                Console.WriteLine(item.Name + "-" + item.InnerText);
            }

            // Attributes written on the closing tag, e.g. </ul class="">.
            HtmlAttributeCollection attrs1 = node.ClosingAttributes;
            Console.WriteLine(attrs1.Count); // author's note: 0

            // Author's note: the first and last children were newline text nodes,
            // so the first li was actually the second child.
            HtmlNode node1 = node.FirstChild;
            Console.WriteLine(node1.NodeType); // author's note: Text
            HtmlNode node3 = node.LastChild;
            Console.WriteLine(node3.NodeType); // author's note: Text

            // First div child of the node.
            HtmlNode node2 = node.SelectSingleNode("child::div[1]");
            Console.WriteLine(node2.XPath); // XPath expression generated for the node

            Console.WriteLine(node.HasAttributes);        // whether the node has attributes
            Console.WriteLine(node.HasChildNodes);        // whether the node has child nodes
            Console.WriteLine(node.HasClosingAttributes); // whether the closing tag has attributes
            Console.WriteLine(node.Line);                 // line of the node's start tag in the page source
            Console.WriteLine(node.LinePosition);         // column of the node's start tag
            Console.WriteLine(node.NodeType);             // author's note: Element
            Console.WriteLine(node.OriginalName);         // author's note: ul

            HtmlNode node4 = node.SelectSingleNode("child::div[1]");
            Console.WriteLine(node4.InnerText);

            // Two hops each way, to skip the newline text node between elements.
            HtmlNode node5 = node4.NextSibling.NextSibling;
            Console.WriteLine(node5.InnerText);
            HtmlNode node6 = node5.PreviousSibling.PreviousSibling;
            Console.WriteLine(node6.InnerText);

            HtmlNode node7 = node6.ParentNode;
            Console.WriteLine(node7.Name); // author's note: ul

            string str = node.OuterHtml;
            Console.WriteLine(str);                 // the node's full markup
            Console.WriteLine(node.StreamPosition); // position of the node relative to the start of the document

            HtmlAgilityPack.HtmlDocument doc1 = node.OwnerDocument;

            foreach (HtmlAgilityPack.HtmlNode div in body.SelectNodes("//div"))
            {
                var classValue = div.Attributes["class"] == null ? null : div.Attributes["class"].Value;
                if (classValue == "first")
                {
                    //write innerText into a table at place [i][column1]
                }
                else if (classValue == "second")
                {
                    //write innerText into the same table in [i][column2]
                }
            }

            string innerText1 = document.DocumentNode.SelectSingleNode("//body").SelectNodes("//div").Single(n => n.Attributes.Any(a => a.Name == "class" && a.Value == "first")).InnerText;
        }
    }
}
csharp: using HtmlAgilityPack and ScrapySharp reading Url find text的更多相关文章
- 爬虫技术 -- 进阶学习(十)网易新闻页面信息抓取(htmlagilitypack搭配scrapysharp)
最近在弄网页爬虫这方面的,上网看到关于htmlagilitypack搭配scrapysharp的文章,于是决定试一试~ 于是到https://www.nuget.org/packages/Scrapy ...
- HtmlAgilityPack搭配 ScrapySharp或HtmlAgilityPack.CssSelectors
Html Agility Pack 源码中的类大概有28个左右,其实不算一个很复杂的类库,但它的功能确不弱,为解析DOM已经提供了足够强大的功能支持,可以跟jQuery操作DOM媲 美:)Html A ...
- 网易新闻页面信息抓取 -- htmlagilitypack搭配scrapysharp
最近在弄网页爬虫这方面的,上网看到关于htmlagilitypack搭配scrapysharp的文章,于是决定试一试~ 于是到https://www.nuget.org/packages/Scrapy ...
- 网易新闻页面信息抓取(htmlagilitypack搭配scrapysharp)
转自原文 网易新闻页面信息抓取(htmlagilitypack搭配scrapysharp) 最近在弄网页爬虫这方面的,上网看到关于htmlagilitypack搭配scrapysharp的文章,于是决 ...
- c#中的解析HTML组件 -- (HtmlAgilityPack,Jumony,ScrapySharp,NSoup,Fizzler)
做数据抓取,网络爬虫方面的开发,自然少不了解析HTML源码的操作.那么问题来了,到底.NET如何来解析HTML,有哪些解析HTML源码的好用的,有效的组件呢? 作者在开始做这方面开发的时候就被这些 ...
- 使用HtmlAgilityPack和ScrapySharp抓取网页数据遇到的几个问题解决方法——格式编码问题
需要用到对应市区县街道居委会的区域编码,于是找到统计局的网页,对这些数据进行抓取,用到了HtmlAgilityPack和ScrapySharp,由于也是第一次从网页抓取数据,所以对于HtmlAgili ...
- C#+HtmlAgilityPack+XPath带你采集数据(以采集天气数据为例子)
第一次接触HtmlAgilityPack是在5年前,一些意外,让我从技术部门临时调到销售部门,负责建立一些流程和寻找潜在客户,最后在阿里巴巴找到了很多客户信息,非常全面,刚开始是手动复制到Excel, ...
- Net处理html页面元素工具类(HtmlAgilityPack.dll)的使用
现在,在不少应用场合中都希望做到数据抓取,特别是基于网页部分的抓取.其实网页抓取的过程实际上是通过编程的方法,去抓取不同网站网页后,再进行 分析筛选的过程.比如,有的比较购物网站,会同时去抓取不同购物 ...
- C#:使用HtmlAgilityPack解析Html
推荐阅读: HtmlAgilityPack 入门教程1 HtmlAgilityPack入门教程2 向HtmlAgilityPack道歉:解析HTML还是你好用 获取html中meta标签中的conte ...
随机推荐
- SDWebImage之SDWebImageManager
SDWebImageManager是SDWebImage的核心类.它拥有一个SDWebImageCache和一个SDWebImageDownloader属性,分别用于图片的缓存和下载处理.虽然是核心类 ...
- vue高级组件之provide / inject
转载:https://blog.csdn.net/Garrettzxd/article/details/81407199 在vue中不同组件通信方式如下 1.父子组件,通过prop 2.非父子组件,通 ...
- java visualVM(jconsole)远程监控服务器java进程
1. JMX方式(jconsole也可通过此方式进行连接) jmx方式能监控到CPU信息,但无法使用visualVM的visualVM GC插件 jmx无密码方式 监控普通的java进程 . 设 ...
- myeclipise生成javadoc
1.点击项目,右键,选择export: 点击next: 点击next:VM options中输入-encoding UTF-8 -charset UTF-8
- 故事描述SVM----支持向量机/support vector machine (SVM)
作者:简之链接:https://www.zhihu.com/question/21094489/answer/86273196来源:知乎著作权归作者所有.商业转载请联系作者获得授权,非商业转载请注明出 ...
- 2019年19道java经典面试题(附答案)
1.不可变对象 指对象一旦被创建状态不能再改变.任何修改都会创建一个新的对象,如 String.Integer及其它包装类. 2.能否创建一个包含可变对象的不可变对象? 可以.不要共享可变对象的引用就 ...
- “玲珑杯”ACM比赛 Round #18---图论你先敲完模板(DP+思维)
题目链接 DESCRIPTION INPUT OUTPUT SAMPLE INPUT 2 3 2 3 5 7 3 10 3 5 7 SAMPLE OUTPUT 12 26 HINT 官方题解: 代码如 ...
- 14-02 Java Math类,Random类,System类,BigDecimal类
Math类 Math的方法 package cn.itcast_01; /* * Math:用于数学运算的类. * 成员变量: * public static final double PI * pu ...
- asp.net core的docker实践
如果centos中没有安装和docker和.net core镜像,先安装docker和asp.net core 镜像 安装dockeryum -y install docker-io 启动 Docke ...
- spring载入外部配置文件的方法
<bean class=“org.springframework.beans.factory.config.PropertyPlaceholderConfigurer“> <prop ...