【C#爬虫】抓取XX网站mp4资源地址
抓取小视频的url地址,然后将地址信息拷贝到迅雷里批量下载就ok了
主程序 代码
// Category slug -> classid expected by the site's play endpoint.
//yazhouqingseAV 35
//zhifusiwaAV 29
//zipaishipin 30
//oumeiqingseAV 28
//katongdongman 31
//tongxingAV 32
//sanjidianying 33
//fengkuangqunjiao 34

// Walks the listing pages of one category, resolves every linked item to its
// media URL via the play endpoint, and appends "<url>?title<title>" lines to
// D://<type>.txt.
var client = new WinHttpHelper();
var type = "fengkuangqunjiao";
// Must be the classid that belongs to `type` in the table above.
var classid = 34;

// NOTE(review): the numeric literals of this loop were lost in the published
// listing. Starting at 1 with an un-suffixed first page ("index.html", then
// "index_2.html", ...) is an assumption - confirm against the site.
for (int i = 1; ; i++)
{
    Console.WriteLine(i);
    var index = i == 1 ? "" : "_" + i;

    string pageUrl = "http://www.lang34.com/se/" + type + "/index" + index + ".html";

    var trs = RegexHelper.GetMathList(client.GET(pageUrl, Encoding.UTF8), "" + type + "/(.*?).html");

    // The original loop had no exit condition. Stop as soon as a listing page
    // yields nothing (also covers a failed request, whose "body" is just the
    // error text, and a null list from GetMathList).
    if (trs == null || trs.Count == 0)
    {
        break;
    }

    foreach (var item in trs)
    {
        string temp = "";
        if (!RegexHelper.GetMatchStr(item.ToString(), "" + type + "/(.*?).html", true, out temp))
        {
            continue;
        }

        string url = "http://www.lang34.com/e/DownSys/play/?classid=" + classid + "&id=" + temp + "&pathid=0";
        string htmltext = client.GET(url, Encoding.UTF8);

        string mp4 = "";
        if (!RegexHelper.GetMatchStr(htmltext, "f:'(.*?)',", true, out mp4))
        {
            continue;
        }

        // The title is optional; an empty string is written when it is missing.
        string titile = "";
        RegexHelper.GetMatchStr(htmltext, " <title>(.*?)</title>", true, out titile);

        string output = mp4 + "?title" + titile + "\r\n";
        Console.WriteLine(output);
        File.AppendAllText("D://" + type + ".txt", output);
    }
}
网络请求类
using System;
using System.Collections.Generic;
using System.Text;

namespace MyHelper4Web
{
    /// <summary>
    /// Small GET/POST helper built on the WinHttp COM request object.
    /// A single underlying request object is reused for every call made through
    /// one helper instance.
    /// </summary>
    /// <remarks>
    /// Failures are reported in-band: when a request throws, the byte[] overloads
    /// return the exception message and source encoded with
    /// <see cref="Encoding.Default"/> instead of a response body. Callers cannot
    /// distinguish that from a real body by type alone.
    /// </remarks>
    public class WinHttpHelper
    {
        WinHttp.WinHttpRequest request;

        /// <summary>Value sent in the Accept header.</summary>
        public string Accept = "*/*";

        /// <summary>Value sent in the User-Agent header.</summary>
        public string UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; InfoPath.2; .NET4.0E)";

        /// <summary>Value sent in the Content-Type header of POST requests.</summary>
        public string ContentType = "application/json";// "application/x-www-form-urlencoded";

        /// <summary>
        /// Response timeout in seconds, passed to WaitForResponse.
        /// </summary>
        // NOTE(review): the original literal was lost in the published listing;
        // 10 is an assumed default - confirm.
        public int SetTimeOut = 10;

        /// <summary>Whether redirects are followed automatically.</summary>
        public bool AllowAutoRedirect = true;

        /// <summary>Whether a redirect from https to http is allowed.</summary>
        public bool AllowHttpstoHttp = false;

        /// <summary>
        /// Creates a helper with the default headers.
        /// </summary>
        public WinHttpHelper()
        {
            request = new WinHttp.WinHttpRequest();
        }

        /// <summary>
        /// Creates a helper with custom request headers.
        /// </summary>
        /// <param name="Accept">Accept header value.</param>
        /// <param name="UserAgent">User-Agent header value.</param>
        /// <param name="ContentType">Content-Type header value (POST only).</param>
        public WinHttpHelper(string Accept, string UserAgent, string ContentType)
            : this() // the request object must exist; it was previously left null here
        {
            this.Accept = Accept;
            this.UserAgent = UserAgent;
            this.ContentType = ContentType;
        }

        /// <summary>
        /// Creates a helper with custom request headers and timeout.
        /// </summary>
        /// <param name="Accept">Accept header value.</param>
        /// <param name="UserAgent">User-Agent header value.</param>
        /// <param name="ContentType">Content-Type header value (POST only).</param>
        /// <param name="SetTimeOut">Response timeout in seconds.</param>
        public WinHttpHelper(string Accept, string UserAgent, string ContentType, int SetTimeOut)
            : this(Accept, UserAgent, ContentType)
        {
            this.SetTimeOut = SetTimeOut;
        }

        /// <summary>
        /// Requests a page with GET.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="refer">Referer header value; omitted when null or empty.</param>
        /// <returns>
        /// The response body as bytes, or the encoded exception text when the
        /// request fails.
        /// </returns>
        public byte[] GET(string Url, string refer)
        {
            return Execute("GET", Url, refer, null, "");
        }

        /// <summary>
        /// Requests a page with GET and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>
        /// The response as a string, or the exception message and source when
        /// decoding fails.
        /// </returns>
        public string GET(string Url, Encoding Encode)
        {
            string htmltext = "";
            try
            {
                byte[] htmlbyte = GET(Url, "");
                htmltext = Encode.GetString(htmlbyte);
            }
            catch (Exception ex)
            {
                htmltext = ex.Message + ex.Source;
                ////LogHelper.Log.Error("GET request failed", ex);
            }
            return htmltext;
        }

        /// <summary>
        /// Requests a page with GET, sending a Referer, and decodes the response.
        /// Unlike the other string overloads this one lets decoding exceptions
        /// propagate.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="refer">Referer header value; omitted when null or empty.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>The response as a string.</returns>
        public string GET(string Url, string refer, Encoding Encode)
        {
            byte[] htmlbyte = GET(Url, refer);
            return Encode.GetString(htmlbyte);
        }

        /// <summary>
        /// Requests a page with POST.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Refer">Referer header value; omitted when null or empty.</param>
        /// <returns>
        /// The response body as bytes, or the encoded exception text when the
        /// request fails.
        /// </returns>
        public byte[] POST(string Url, string PostData, string Refer)
        {
            return Execute("POST", Url, Refer, ContentType, PostData);
        }

        /// <summary>
        /// Requests a page with POST and no Referer.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <returns>
        /// The response body as bytes, or the encoded exception text when the
        /// request fails.
        /// </returns>
        public byte[] POST(string Url, string PostData)
        {
            return POST(Url, PostData, "");
        }

        /// <summary>
        /// Requests a page with POST and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Refer">Referer header value; omitted when null or empty.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>
        /// The response as a string, or the exception message and source when
        /// decoding fails.
        /// </returns>
        public string POST(string Url, string PostData, string Refer, Encoding Encode)
        {
            string htmltext = string.Empty;
            try
            {
                byte[] responsebody = POST(Url, PostData, Refer);
                htmltext = Encode.GetString(responsebody);
            }
            catch (Exception ex)
            {
                htmltext = ex.Message + ex.Source;
                ////LogHelper.Log.Error("POST request failed", ex);
            }
            return htmltext;
        }

        /// <summary>
        /// Requests a page with POST and no Referer, and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>
        /// The response as a string, or the exception message and source when
        /// decoding fails.
        /// </returns>
        public string POST(string Url, string PostData, Encoding Encode)
        {
            return POST(Url, PostData, "", Encode);
        }

        /// <summary>
        /// Returns all response headers of the most recent request (the method
        /// name is historical; it is not limited to cookies).
        /// </summary>
        /// <returns>The raw header block, or an empty string when unavailable.</returns>
        public string GetAllCookis()
        {
            try
            {
                return request.GetAllResponseHeaders();
            }
            catch (Exception)
            {
                return "";
            }
        }

        /// <summary>
        /// Shared implementation of GET and POST.
        /// </summary>
        /// <param name="method">HTTP verb.</param>
        /// <param name="url">URL to request.</param>
        /// <param name="referer">Referer header value; omitted when null or empty.</param>
        /// <param name="contentType">Content-Type header value; omitted when null.</param>
        /// <param name="body">Request body handed to Send.</param>
        /// <returns>
        /// The response body (never null), or the encoded exception text when
        /// the request fails.
        /// </returns>
        private byte[] Execute(string method, string url, string referer, string contentType, string body)
        {
            try
            {
                // Apply both flags on every call. Previously they were only ever
                // switched one way, so flipping a field back had no effect on the
                // reused request object.
                request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableRedirects, AllowAutoRedirect);
                request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableHttpsToHttpRedirects, AllowHttpstoHttp);

                // Opened asynchronously; WaitForResponse below bounds the wait.
                request.Open(method, url, true);
                request.SetRequestHeader("Accept", Accept);
                request.SetRequestHeader("User-Agent", UserAgent);
                if (contentType != null)
                {
                    request.SetRequestHeader("Content-Type", contentType);
                }
                if (!string.IsNullOrEmpty(referer))
                {
                    request.SetRequestHeader("Referer", referer);
                }

                request.Send(body);
                request.WaitForResponse(SetTimeOut);

                // Normalise a missing body so callers can always decode the result.
                return (byte[])request.ResponseBody ?? new byte[0];
            }
            catch (Exception ex)
            {
                // Deliberate best-effort: the error text is returned as the body.
                ////LogHelper.Log.Error(method + " request failed", ex);
                return Encoding.Default.GetBytes(ex.Message + ex.Source);
            }
        }
    }
}
正则表达式类
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

namespace MyHelper4Web
{
    /// <summary>
    /// Regex convenience methods for pulling text out of HTML.
    /// All patterns are evaluated with Singleline and IgnoreCase. A pattern may
    /// contain the token <c>(.*?)</c>; its first occurrence marks the text that
    /// the "cut" variants return.
    /// </summary>
    public class RegexHelper
    {
        /// <summary>Token in a caller's pattern that marks the text to extract.</summary>
        private const string CaptureToken = "(.*?)";

        /// <summary>Name given internally to the group that replaces the token.</summary>
        private const string CaptureGroupName = "__cut";

        private const RegexOptions MatchOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        /// <summary>
        /// Finds the first match of <paramref name="pattern"/>.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regex pattern, optionally containing <c>(.*?)</c>.</param>
        /// <param name="isCut">
        /// true to return only the text captured by <c>(.*?)</c>; false to return
        /// the whole matched text including what surrounds the capture.
        /// </param>
        /// <param name="result">The extracted text, or an empty string on failure.</param>
        /// <returns>
        /// true when the pattern matched and the captured text is not empty;
        /// false otherwise, including for null input or an invalid pattern.
        /// </returns>
        public static bool GetMatchStr(string htmltext, string pattern, bool isCut, out string result)
        {
            result = "";

            Match match = FindFirst(htmltext, pattern);
            if (match == null)
            {
                return false;
            }

            // Success is judged on the captured text in both modes.
            string captured = Capture(match);
            if (string.IsNullOrEmpty(captured))
            {
                return false;
            }

            result = isCut ? captured : match.Value;
            return true;
        }

        /// <summary>
        /// Same as the four-argument GetMatchStr, but returns the text directly.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regex pattern, optionally containing <c>(.*?)</c>.</param>
        /// <param name="isCut">
        /// true to return only the captured text; false to return the whole match.
        /// </param>
        /// <returns>The extracted text, or an empty string when nothing matched.</returns>
        public static string GetMatchString(string htmltext, string pattern, bool isCut)
        {
            string result;
            GetMatchStr(htmltext, pattern, isCut, out result);
            return result;
        }

        /// <summary>
        /// Finds the first match and returns the text captured by <c>(.*?)</c>
        /// (or the whole match when the pattern has no such token).
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regex pattern.</param>
        /// <param name="result">The captured text, or an empty string on failure.</param>
        /// <returns>true when a non-empty result was found.</returns>
        public static bool GetMatchStr(string htmltext, string pattern, out string result)
        {
            return GetMatchStr(htmltext, pattern, true, out result);
        }

        /// <summary>
        /// Finds every match of <paramref name="pattern"/>.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regex pattern.</param>
        /// <returns>
        /// The full text of each match, in order, as strings. Empty (never null)
        /// when nothing matches, the input is null, or the pattern is invalid.
        /// </returns>
        public static ArrayList GetMathList(string htmltext, string pattern)
        {
            ArrayList list = new ArrayList();

            Regex regex = Build(pattern);
            if (regex == null || htmltext == null)
            {
                return list;
            }

            foreach (Match match in regex.Matches(htmltext))
            {
                list.Add(match.Value);
            }
            return list;
        }

        /// <summary>Returns the first successful match, or null when there is none.</summary>
        private static Match FindFirst(string htmltext, string pattern)
        {
            Regex regex = Build(pattern);
            if (regex == null || htmltext == null)
            {
                return null;
            }

            Match match = regex.Match(htmltext);
            return match.Success ? match : null;
        }

        /// <summary>
        /// Compiles the pattern, turning the first <c>(.*?)</c> into a named group
        /// so its text can be read back directly. Returns null for a null or
        /// invalid pattern.
        /// </summary>
        private static Regex Build(string pattern)
        {
            if (pattern == null)
            {
                return null;
            }

            // Reading the group replaces the old approach of deleting the
            // pattern's literal prefix/suffix from the match, which failed when
            // the pattern held regex metacharacters or the split character.
            string effective = pattern;
            int at = pattern.IndexOf(CaptureToken, StringComparison.Ordinal);
            if (at >= 0)
            {
                effective = pattern.Substring(0, at)
                    + "(?<" + CaptureGroupName + ">.*?)"
                    + pattern.Substring(at + CaptureToken.Length);
            }

            try
            {
                return new Regex(effective, MatchOptions);
            }
            catch (ArgumentException)
            {
                // Invalid pattern: treated as "no match", as before.
                return null;
            }
        }

        /// <summary>
        /// Returns the text of the marked group, or the whole match when the
        /// pattern had no <c>(.*?)</c> token.
        /// </summary>
        private static string Capture(Match match)
        {
            Group group = match.Groups[CaptureGroupName];
            return group.Success ? group.Value : match.Value;
        }
    }
}
【C#爬虫】抓取XX网站mp4资源地址的更多相关文章
- python爬虫 抓取一个网站的所有网址链接
sklearn实战-乳腺癌细胞数据挖掘 https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campai ...
- python爬虫--爬取某网站电影下载地址
前言:因为自己还是python世界的一名小学生,还有很多路要走,所以本文以目的为向导,达到目的即可,对于那些我自己都没弄懂的原理,不做去做过多解释,以免误人子弟,大家可以网上搜索. 友情提示:本代码用 ...
- 一个简单的scrapy爬虫抓取豆瓣刘亦菲的图片地址
一.第一步是创建一个scrapy项目 sh-3.2# scrapy startproject liuyifeiImage sh-3.2# chmod -R 777 liuyifeiImage/ 二.分 ...
- 爬虫抓取页面数据原理(php爬虫框架有很多 )
爬虫抓取页面数据原理(php爬虫框架有很多 ) 一.总结 1.php爬虫框架有很多,包括很多傻瓜式的软件 2.照以前写过java爬虫的例子来看,真的非常简单,就是一个获取网页数据的类或者方法(这里的话 ...
- python 爬虫抓取心得
quanwei9958 转自 python 爬虫抓取心得分享 urllib.quote('要编码的字符串') 如果你要在url请求里面放入中文,对相应的中文进行编码的话,可以用: urllib.quo ...
- C# 爬虫 抓取小说
心血来潮,想研究下爬虫,爬点小说. 通过百度选择了个小说网站,随便找了一本小书http://www.23us.so/files/article/html/13/13655/index.html. 1. ...
- Java 实现 HttpClients+jsoup,Jsoup,htmlunit,Headless Chrome 爬虫抓取数据
最近整理一下手头上搞过的一些爬虫,有HttpClients+jsoup,Jsoup,htmlunit,HeadlessChrome 一,HttpClients+jsoup,这是第一代比较low,很快就 ...
- PID控制器的应用:控制网络爬虫抓取速度
一.初识PID控制器 冬天乡下人喜欢烤火取暖,常见的情形就是四人围着麻将桌,桌底放一盆碳火.有人觉得火不够大,那加点木炭吧,还不够,再加点.片刻之后,又觉得火太大,脚都快被烤熟了,那就取出一些木碳…… ...
- Python爬虫抓取东方财富网股票数据并实现MySQL数据库存储
Python爬虫可以说是好玩又好用了.现想利用Python爬取网页股票数据保存到本地csv数据文件中,同时想把股票数据保存到MySQL数据库中.需求有了,剩下的就是实现了. 在开始之前,保证已经安装好 ...
随机推荐
- Android学习笔记:如何设置ImageView中图片的显示方式
我们在用ImageView显示图片时,很多情况下图片的大小与ImageView的尺寸不是完全一样的.这时就涉及到该如何设置显示图片了. ImageView有个重要的属性是ScaleType,该属性用以 ...
- IPTV中的EPG前端优化
先看一下IPTV相关情况: l 目前TPTV市场情况 a) 截止今年2月,全国IPTV总用户数达3630.2万,我国移动互联网用户规模接近9亿,人均月接入量近300M,8M宽带达半数,光纤近4成. 图 ...
- java.sql.SQLException: ORA-00604: 递归 SQL 级别 1 出现错误
后台报出如下错误: Caused by: java.sql.SQLException: ORA-00604: 递归 SQL 级别 1 出现错误 ORA-01000: 超出打开游标的最大数 ORA-00 ...
- C#方法的使用
static void Main(string[] arr) { , ); Console.WriteLine(max); Console.ReadKey(); } /// <summary&g ...
- php中iconv函数的一个小bug--转载
iconv转换字符集很好用,但是有时候你会发现iconv转换的时候会返回false或者空字符串,严格说来这算不上是iconv的问题,这其实是字符集的问题,但是实际编码中应该算是iconv的bug了. ...
- Convert String to Long
问题: Given a string, write a routine that converts the string to a long, without using the built in f ...
- vs2008下使用libcurl
网上找了半天,总算找到一个比较好用的C++ 网络库,老实说,完全用Socket操作网络对于需要开发网络应用程序的人员来说还是很蛋疼很繁琐的.好在有这么一个给力的库.这个库的介绍可以自己百度一下,就我所 ...
- nodejs http.get乱码问题处理方法
var req = http.get(url,function(res){ res.setEncoding('utf-8'); var html = '' res.on('data',function ...
- smarty 中时间格式化的用法
大家都知道PHP中输出时间和日期可以用 date("Y-m-d H:i:s",时间戳) , 但是在smarty模板中,$time|date_format:'%Y-%m-%d %H ...
- php 编译安装curl 时候出现问题
/usr/bin/ld: ext/curl/.libs/interface.o: undefined reference to symbol 'CRYPTO_set_id_callback@@OPEN ...