html网页采集
UI_Less.pas:
unit UI_Less; interface uses
Windows, Classes, Messages, Forms, MsHtml, Urlmon, ActiveX; const
WM_USER_STARTWALKING = WM_USER + ;
DISPID_AMBIENT_DLCONTROL = (-);
READYSTATE_COMPLETE = $; DLCTL_DLIMAGES = $;
DLCTL_VIDEOS = $;
DLCTL_BGSOUNDS = $;
DLCTL_NO_SCRIPTS = $;
DLCTL_NO_JAVA = $;
DLCTL_NO_RUNACTIVEXCTLS = $;
DLCTL_NO_DLACTIVEXCTLS = $;
DLCTL_DOWNLOADONLY = $;
DLCTL_NO_FRAMEDOWNLOAD = $;
DLCTL_RESYNCHRONIZE = $;
DLCTL_PRAGMA_NO_CACHE = $;
DLCTL_NO_BEHAVIORS = $;
DLCTL_NO_METACHARSET = $;
DLCTL_URL_ENCODING_DISABLE_UTF8 = $;
DLCTL_URL_ENCODING_ENABLE_UTF8 = $;
DLCTL_FORCEOFFLINE = $;
DLCTL_NO_CLIENTPULL = $;
DLCTL_SILENT = $;
DLCTL_OFFLINEIFNOTCONNECTED = $;
DLCTL_OFFLINE = DLCTL_OFFLINEIFNOTCONNECTED; type
TUILess = class(TComponent, IUnknown, IDispatch, IPropertyNotifySink,
IOleClientSite)
private
FDocTitle: string;
FBodyText: TStrings;
FBodyHtml: TStrings;
protected
/// IDISPATCH
function Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
stdcall;
/// IPROPERTYNOTIFYSINK
function OnChanged(DispID: TDispID): HResult; stdcall;
function OnRequestEdit(DispID: TDispID): HResult; stdcall;
/// IOLECLIENTSITE
function SaveObject: HResult; stdcall;
function GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
out mk: IMoniker): HResult; stdcall;
function GetContainer(out container: IOleContainer): HResult; stdcall;
function ShowObject: HResult; stdcall;
function OnShowWindow(fShow: BOOL): HResult; stdcall;
function RequestNewObjectLayout: HResult; stdcall;
///
function LoadUrlFromMoniker: HResult;
function LoadUrlFromFile: HResult;
// * We only use LoadUrlFromMoniker, but we could use LoadUrlFromFile instead. public
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
property DocTitle: string read FDocTitle;
property BodyText: TStrings read FBodyText write FBodyText;
property BodyHtml: TStrings read FBodyHtml write FBodyHtml;
function Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
: IHTMLELEMENTCollection;
procedure GetAnchorList(IC: IHTMLELEMENTCollection; Anchorlist: TStrings);
procedure GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
end; implementation var
Doc: IhtmlDocument2;
_URL: PWidechar; constructor TUILess.Create(AOwner: TComponent);
begin
inherited Create(AOwner);
FBodyText := TStringList.Create;
FBodyHtml := TStringList.Create;
end; destructor TUILess.Destroy;
begin
if Assigned(FBodyText) then
FBodyText.Free;
if Assigned(FBodyHtml) then
FBodyHtml.Free;
inherited Destroy;
end; /// CORE ---->>>>>>>>>
function TUILess.Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
: IHTMLELEMENTCollection;
var
Cookie: Integer;
CP: IConnectionPoint;
OleObject: IOleObject;
OleControl: IOleControl;
CPC: IConnectionPointContainer;
All: IHTMLElement;
Msg: TMsg;
hr: HResult;
begin
_URL := URL;
IsSucceed := false;
try
CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2, Doc);
OleObject := Doc as IOleObject;
OleObject.SetClientSite(self);
OleControl := Doc as IOleControl;
OleControl.OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
CPC := Doc as IConnectionPointContainer;
CPC.FindConnectionPoint(IPropertyNotifySink, CP);
CP.Advise(self, Cookie);
hr := LoadUrlFromMoniker; // alternative: Hr:= LoadUrlFromFile;
if ((SUCCEEDED(hr)) or (hr = E_PENDING)) then
while (GetMessage(Msg, , , )) do
begin
if ((Msg.message = WM_USER_STARTWALKING) and (Msg.hwnd = )) then
begin
PostQuitMessage();
result := Doc.Get_all;
All := Doc.Get_body;
FDocTitle := string(Doc.nameProp);
FBodyText.Text := string(All.outerText);
FBodyHtml.Text := string(All.outerHTML);
IsSucceed := true;
end
else
DispatchMessage(Msg);
if IsStop then
Exit;
end;
except
Exit;
end;
end; function TUILess.Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
var
I: Integer;
begin
if DISPID_AMBIENT_DLCONTROL = DispID then
begin
I := DLCTL_DOWNLOADONLY + DLCTL_NO_SCRIPTS + DLCTL_NO_JAVA +
DLCTL_NO_DLACTIVEXCTLS + DLCTL_NO_RUNACTIVEXCTLS;
PVariant(VarResult)^ := I;
result := S_OK;
end
else
result := DISP_E_MEMBERNOTFOUND;
end; function TUILess.OnChanged(DispID: TDispID): HResult;
var
dp: TDispParams;
vResult: OleVariant;
begin
if (DISPID_READYSTATE = DispID) then
if SUCCEEDED((Doc as IhtmlDocument2).Invoke(DISPID_READYSTATE, GUID_null,
LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, dp, @vResult, nil, nil))
then
if Integer(vResult) = READYSTATE_COMPLETE then
PostThreadMessage(GetCurrentThreadId(), WM_USER_STARTWALKING, , );
end; function TUILess.LoadUrlFromMoniker: HResult;
var
Moniker: IMoniker;
BindCtx: IBindCTX;
PM: IPersistMoniker;
begin
createURLMoniker(nil, _URL, Moniker);
CreateBindCtx(, BindCtx);
PM := Doc as IPersistMoniker;
result := PM.Load(LongBool(), Moniker, BindCtx, STGM_READ)
end; function TUILess.LoadUrlFromFile: HResult;
var
PF: IPersistfile;
begin
PF := Doc as IPersistfile;
result := PF.Load(_URL, );
end; // 获取图像链接
procedure TUILess.GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
var
Image: IHTMLImgElement;
Disp: IDispatch;
x: Integer;
begin
if IC <> nil then
begin
for x := to IC.Length - do
begin
application.ProcessMessages;
Disp := IC.item(x, );
if SUCCEEDED(Disp.QueryInterface(IHTMLImgElement, Image)) then
ImageList.add(string(Image.src));
end;
end;
end; // 获取链接
procedure TUILess.GetAnchorList(IC: IHTMLELEMENTCollection;
Anchorlist: TStrings);
var
anchor: IHTMLAnchorElement;
Disp: IDispatch;
x: Integer;
begin
if IC <> nil then
begin
for x := to IC.Length - do
begin
application.ProcessMessages;
Disp := IC.item(x, );
if (SUCCEEDED(Disp.QueryInterface(IHTMLAnchorElement, anchor)) and
(anchor.href <> '')) then
Anchorlist.add(string(anchor.href));
end;
end;
end; /// Don't Care ------>>>>>>>>>>>
function TUILess.OnRequestEdit(DispID: TDispID): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.SaveObject: HResult;
begin
result := E_NOTIMPL;
end; function TUILess.GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
out mk: IMoniker): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.GetContainer(out container: IOleContainer): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.ShowObject: HResult;
begin
result := E_NOTIMPL;
end; function TUILess.OnShowWindow(fShow: BOOL): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.RequestNewObjectLayout: HResult;
begin
result := E_NOTIMPL;
end; end.
Unit3.pas:
unit Unit3; interface uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls; type
TForm3 = class(TForm)
Button1: TButton;
Edit1: TEdit;
Memo1: TMemo;
Button2: TButton;
Button3: TButton;
Button4: TButton;
procedure Button1Click(Sender: TObject);
procedure Button2Click(Sender: TObject);
procedure Button3Click(Sender: TObject);
procedure Button4Click(Sender: TObject);
private
{ Private declarations }
procedure into(i: Word);
public
{ Public declarations }
end; var
Form3: TForm3; implementation
uses UI_Less; {$R *.dfm} function DoStrToWideChar(s: string): PWideChar;
var
// s:sting;
pwc: PWidechar;
len: integer;
begin
// s:= 'abcdefg ';
len := length(s) + ;
pwc := AllocMem(len * sizeof(widechar));
stringtowidechar(s, pwc, len);
// showmessage(widechartostring(pwc)); result := pwc;
// FreeMem(pwc);
end; procedure TForm3.into(i: Word);
var
sh: TUILess;
su: boolean; // 是否获取成功
// isstop: boolean; //设全局变量可以中断连接 ,避免出错
surl: PWideChar;
begin
surl := DoStrToWideChar(Trim(Edit1.Text));
sh := TUILess.Create(nil);
try
Memo1.Clear;
case i of
:
sh.GetAnchorList(sh.get(surl, su, False), Memo1.Lines);
:
sh.GetImageList(sh.get(surl, su, False), Memo1.Lines);
:
begin
sh.get(surl, su, False);
Memo1.Lines := sh.BodyText;
end;
:
begin
sh.get(surl, su, False);
Memo1.Lines := sh.BodyHtml;
end;
end;
finally
//sh.Free;
end;
end; procedure TForm3.Button1Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button2Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button3Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button4Click(Sender: TObject);
begin
into();
end; end.
html网页采集的更多相关文章
- Hawk 3. 网页采集器
1.基本入门 1. 原理(建议阅读) 网页采集器的功能是获取网页中的数据(废话).通常来说,目标可能是列表(如购物车列表),或是一个页面中的固定字段(如JD某商品的价格和介绍,在页面中只有一个).因此 ...
- Fiddler 网页采集抓包利器
最近这段时间,网页采集方面的工作做得比较多.用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示:基于weiphp做了一个掌上 ...
- Fiddler 网页采集抓包利器__手机app抓包
用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示: 基于weiphp做了一个掌上头条插件,也是用的网页采集技术:和一个创业 ...
- 网页采集利器 phpQuery
网页采集利器 phpQuery 2012-02-28 11:43:24| 分类: php|举报|字号 订阅 在网页采集的时候,通常都会用到正则表达式.但是有时候对于正则不太好的同学,比如我, ...
- 网页采集器-UA伪装
网页采集器-UA伪装 UA伪装 请求载体身份标识的伪装: User-Agent: 请求载体身份标识,通过浏览器发起的请求,请求载体为浏览器,则该请求的User-Agent为浏览器的身份标识,如果使用爬 ...
- 异步网页采集利器CasperJs
在采集网页中,我们会经常遇到采集一些异步加载页面的网页,我们通常用的httpwebrequest类就采集不到了,这个时候我们通常会采用webbrowser来辅助采集,但是.net下自带的webbrow ...
- 简单的网页采集程序(ASP.NET MVC4)
因为懒人太多,造成现在网页数据采集非常的流行,我也来写个简单的记录一下. 之前写了MVC的基本框架的搭建随笔,后面因为公司太忙,个人感情问题:(,导致不想写了,就写了两篇给删除了,现在就搁浅了, 本人 ...
- 史林枫:开源HtmlAgilityPack公共小类库封装 - 网页采集(爬虫)辅助解析利器【附源码+可视化工具推荐】
做开发的,可能都做过信息采集相关的程序,史林枫也经常做一些数据采集或某些网站的业务办理自动化操作软件. 获取目标网页的信息很简单,使用网络编程,利用HttpWebResponse.HttpWebReq ...
- C#网页采集数据的几种方式(WebClient、WebBrowser和HttpWebRequest/HttpWebResponse)
一.通过WebClient获取网页内容 这是一种很简单的获取方式,当然,其它的获取方法也很简单.在这里首先要说明的是,如果为了实际项目的效率考虑,需要考虑在函数中分配一个内存区域.大概写法如下 //M ...
- 网页采集(通过HtmlAgilityPack+XPath)
有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取.因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记. 1.下载HtmlAgilityP ...
随机推荐
- cache 基本原理
Cache 主要由 Cache Tag,Cache 存储体,Cache 控制模块组成.Cache Tag 主要用来记录 Cache 存储体中数据的位置和判断 Cache 内数据是否命中: Ca ...
- 洛谷 P1352 没有上司的舞会【树形DP】(经典)
<题目链接> <转载于>>> > 题目描述: 某大学有N个职员,编号为1~N.他们之间有从属关系,也就是说他们的关系就像一棵以校长为根的树,父结点就是子结点的 ...
- HDU 1358 Period (kmp求循环节)(经典)
<题目链接> 题目大意: 意思是,从第1个字母到第2字母组成的字符串可由某一周期性的字串(“a”) 的两次组成,也就是aa有两个a组成: 第三行自然就是aabaab可有两个aab组成: 第 ...
- hdu 1272 小希的迷宫【并查集】
<题目链接> 小希的迷宫 Problem Description 上次Gardon的迷宫城堡小希玩了很久(见Problem B),现在她也想设计一个迷宫让Gardon来走.但是她设计迷宫的 ...
- DataTable保存为Excel或者Txt
保存为txt的时候,可保持原来的行列对齐,如下:using System; using System.Collections.Generic; using System.Text; using Sys ...
- Bootstrsp 表单
摘要:bootstrap提供了三种表单布局:垂直表单,内联表单和水平表单.本文主要对这三种表单布局做一下基础的介绍,以及对表单控件做简要概括. 1.垂直表单 垂直表单也称基础表单.其基本结构如下: & ...
- flask第十篇——url_for【3】
继续url_for的知识点 # coding: utf-8 from flask import Flask, url_forimport flask app = Flask(__name__) ...
- Python图形编程探索系列-09-tkinter与matplotlib结合案例
案例1 案例来自于:https://bbs.csdn.net/topics/390326088 代码示例: import matplotlib matplotlib.use('TkAgg') from ...
- redis清除缓存和连接远程服务器
直接进入命令行输入 1.连接远程redis: redis-cli -h 127.0.0.1 -p 3008 -a pIctur3 (a后是密码) 2.查看缓存:keys * 3.清除缓存:de ...
- 关于redis与memcached区别(转载自stackoverflow)
转载自:http://stackoverflow.com/questions/10558465/memcached-vs-redis Updated 10/9/2014 Today marks the ...