html网页采集
UI_Less.pas:
unit UI_Less; interface uses
Windows, Classes, Messages, Forms, MsHtml, Urlmon, ActiveX; const
WM_USER_STARTWALKING = WM_USER + ;
DISPID_AMBIENT_DLCONTROL = (-);
READYSTATE_COMPLETE = $; DLCTL_DLIMAGES = $;
DLCTL_VIDEOS = $;
DLCTL_BGSOUNDS = $;
DLCTL_NO_SCRIPTS = $;
DLCTL_NO_JAVA = $;
DLCTL_NO_RUNACTIVEXCTLS = $;
DLCTL_NO_DLACTIVEXCTLS = $;
DLCTL_DOWNLOADONLY = $;
DLCTL_NO_FRAMEDOWNLOAD = $;
DLCTL_RESYNCHRONIZE = $;
DLCTL_PRAGMA_NO_CACHE = $;
DLCTL_NO_BEHAVIORS = $;
DLCTL_NO_METACHARSET = $;
DLCTL_URL_ENCODING_DISABLE_UTF8 = $;
DLCTL_URL_ENCODING_ENABLE_UTF8 = $;
DLCTL_FORCEOFFLINE = $;
DLCTL_NO_CLIENTPULL = $;
DLCTL_SILENT = $;
DLCTL_OFFLINEIFNOTCONNECTED = $;
DLCTL_OFFLINE = DLCTL_OFFLINEIFNOTCONNECTED; type
TUILess = class(TComponent, IUnknown, IDispatch, IPropertyNotifySink,
IOleClientSite)
private
FDocTitle: string;
FBodyText: TStrings;
FBodyHtml: TStrings;
protected
/// IDISPATCH
function Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
stdcall;
/// IPROPERTYNOTIFYSINK
function OnChanged(DispID: TDispID): HResult; stdcall;
function OnRequestEdit(DispID: TDispID): HResult; stdcall;
/// IOLECLIENTSITE
function SaveObject: HResult; stdcall;
function GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
out mk: IMoniker): HResult; stdcall;
function GetContainer(out container: IOleContainer): HResult; stdcall;
function ShowObject: HResult; stdcall;
function OnShowWindow(fShow: BOOL): HResult; stdcall;
function RequestNewObjectLayout: HResult; stdcall;
///
function LoadUrlFromMoniker: HResult;
function LoadUrlFromFile: HResult;
// * We only use LoadUrlFromMoniker, but we could use LoadUrlFromFile instead. public
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
property DocTitle: string read FDocTitle;
property BodyText: TStrings read FBodyText write FBodyText;
property BodyHtml: TStrings read FBodyHtml write FBodyHtml;
function Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
: IHTMLELEMENTCollection;
procedure GetAnchorList(IC: IHTMLELEMENTCollection; Anchorlist: TStrings);
procedure GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
end; implementation var
Doc: IhtmlDocument2;
_URL: PWidechar; constructor TUILess.Create(AOwner: TComponent);
begin
inherited Create(AOwner);
FBodyText := TStringList.Create;
FBodyHtml := TStringList.Create;
end; destructor TUILess.Destroy;
begin
if Assigned(FBodyText) then
FBodyText.Free;
if Assigned(FBodyHtml) then
FBodyHtml.Free;
inherited Destroy;
end; /// CORE ---->>>>>>>>>
function TUILess.Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
: IHTMLELEMENTCollection;
var
Cookie: Integer;
CP: IConnectionPoint;
OleObject: IOleObject;
OleControl: IOleControl;
CPC: IConnectionPointContainer;
All: IHTMLElement;
Msg: TMsg;
hr: HResult;
begin
_URL := URL;
IsSucceed := false;
try
CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2, Doc);
OleObject := Doc as IOleObject;
OleObject.SetClientSite(self);
OleControl := Doc as IOleControl;
OleControl.OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
CPC := Doc as IConnectionPointContainer;
CPC.FindConnectionPoint(IPropertyNotifySink, CP);
CP.Advise(self, Cookie);
hr := LoadUrlFromMoniker; // alternative: Hr:= LoadUrlFromFile;
if ((SUCCEEDED(hr)) or (hr = E_PENDING)) then
while (GetMessage(Msg, , , )) do
begin
if ((Msg.message = WM_USER_STARTWALKING) and (Msg.hwnd = )) then
begin
PostQuitMessage();
result := Doc.Get_all;
All := Doc.Get_body;
FDocTitle := string(Doc.nameProp);
FBodyText.Text := string(All.outerText);
FBodyHtml.Text := string(All.outerHTML);
IsSucceed := true;
end
else
DispatchMessage(Msg);
if IsStop then
Exit;
end;
except
Exit;
end;
end; function TUILess.Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
var
I: Integer;
begin
if DISPID_AMBIENT_DLCONTROL = DispID then
begin
I := DLCTL_DOWNLOADONLY + DLCTL_NO_SCRIPTS + DLCTL_NO_JAVA +
DLCTL_NO_DLACTIVEXCTLS + DLCTL_NO_RUNACTIVEXCTLS;
PVariant(VarResult)^ := I;
result := S_OK;
end
else
result := DISP_E_MEMBERNOTFOUND;
end; function TUILess.OnChanged(DispID: TDispID): HResult;
var
dp: TDispParams;
vResult: OleVariant;
begin
if (DISPID_READYSTATE = DispID) then
if SUCCEEDED((Doc as IhtmlDocument2).Invoke(DISPID_READYSTATE, GUID_null,
LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, dp, @vResult, nil, nil))
then
if Integer(vResult) = READYSTATE_COMPLETE then
PostThreadMessage(GetCurrentThreadId(), WM_USER_STARTWALKING, , );
end; function TUILess.LoadUrlFromMoniker: HResult;
var
Moniker: IMoniker;
BindCtx: IBindCTX;
PM: IPersistMoniker;
begin
createURLMoniker(nil, _URL, Moniker);
CreateBindCtx(, BindCtx);
PM := Doc as IPersistMoniker;
result := PM.Load(LongBool(), Moniker, BindCtx, STGM_READ)
end; function TUILess.LoadUrlFromFile: HResult;
var
PF: IPersistfile;
begin
PF := Doc as IPersistfile;
result := PF.Load(_URL, );
end; // 获取图像链接
procedure TUILess.GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
var
Image: IHTMLImgElement;
Disp: IDispatch;
x: Integer;
begin
if IC <> nil then
begin
for x := to IC.Length - do
begin
application.ProcessMessages;
Disp := IC.item(x, );
if SUCCEEDED(Disp.QueryInterface(IHTMLImgElement, Image)) then
ImageList.add(string(Image.src));
end;
end;
end; // 获取链接
procedure TUILess.GetAnchorList(IC: IHTMLELEMENTCollection;
Anchorlist: TStrings);
var
anchor: IHTMLAnchorElement;
Disp: IDispatch;
x: Integer;
begin
if IC <> nil then
begin
for x := to IC.Length - do
begin
application.ProcessMessages;
Disp := IC.item(x, );
if (SUCCEEDED(Disp.QueryInterface(IHTMLAnchorElement, anchor)) and
(anchor.href <> '')) then
Anchorlist.add(string(anchor.href));
end;
end;
end; /// Don't Care ------>>>>>>>>>>>
function TUILess.OnRequestEdit(DispID: TDispID): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.SaveObject: HResult;
begin
result := E_NOTIMPL;
end; function TUILess.GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
out mk: IMoniker): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.GetContainer(out container: IOleContainer): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.ShowObject: HResult;
begin
result := E_NOTIMPL;
end; function TUILess.OnShowWindow(fShow: BOOL): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.RequestNewObjectLayout: HResult;
begin
result := E_NOTIMPL;
end; end.
Unit3.pas:
unit Unit3; interface uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls; type
TForm3 = class(TForm)
Button1: TButton;
Edit1: TEdit;
Memo1: TMemo;
Button2: TButton;
Button3: TButton;
Button4: TButton;
procedure Button1Click(Sender: TObject);
procedure Button2Click(Sender: TObject);
procedure Button3Click(Sender: TObject);
procedure Button4Click(Sender: TObject);
private
{ Private declarations }
procedure into(i: Word);
public
{ Public declarations }
end; var
Form3: TForm3; implementation
uses UI_Less; {$R *.dfm} function DoStrToWideChar(s: string): PWideChar;
var
// s:sting;
pwc: PWidechar;
len: integer;
begin
// s:= 'abcdefg ';
len := length(s) + ;
pwc := AllocMem(len * sizeof(widechar));
stringtowidechar(s, pwc, len);
// showmessage(widechartostring(pwc)); result := pwc;
// FreeMem(pwc);
end; procedure TForm3.into(i: Word);
var
sh: TUILess;
su: boolean; // 是否获取成功
// isstop: boolean; //设全局变量可以中断连接 ,避免出错
surl: PWideChar;
begin
surl := DoStrToWideChar(Trim(Edit1.Text));
sh := TUILess.Create(nil);
try
Memo1.Clear;
case i of
:
sh.GetAnchorList(sh.get(surl, su, False), Memo1.Lines);
:
sh.GetImageList(sh.get(surl, su, False), Memo1.Lines);
:
begin
sh.get(surl, su, False);
Memo1.Lines := sh.BodyText;
end;
:
begin
sh.get(surl, su, False);
Memo1.Lines := sh.BodyHtml;
end;
end;
finally
//sh.Free;
end;
end; procedure TForm3.Button1Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button2Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button3Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button4Click(Sender: TObject);
begin
into();
end; end.
html网页采集的更多相关文章
- Hawk 3. 网页采集器
1.基本入门 1. 原理(建议阅读) 网页采集器的功能是获取网页中的数据(废话).通常来说,目标可能是列表(如购物车列表),或是一个页面中的固定字段(如JD某商品的价格和介绍,在页面中只有一个).因此 ...
- Fiddler 网页采集抓包利器
最近这段时间,网页采集方面的工作做得比较多.用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示:基于weiphp做了一个掌上 ...
- Fiddler 网页采集抓包利器__手机app抓包
用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示: 基于weiphp做了一个掌上头条插件,也是用的网页采集技术:和一个创业 ...
- 网页采集利器 phpQuery
网页采集利器 phpQuery 2012-02-28 11:43:24| 分类: php|举报|字号 订阅 在网页采集的时候,通常都会用到正则表达式.但是有时候对于正则不太好的同学,比如我, ...
- 网页采集器-UA伪装
网页采集器-UA伪装 UA伪装 请求载体身份标识的伪装: User-Agent: 请求载体身份标识,通过浏览器发起的请求,请求载体为浏览器,则该请求的User-Agent为浏览器的身份标识,如果使用爬 ...
- 异步网页采集利器CasperJs
在采集网页中,我们会经常遇到采集一些异步加载页面的网页,我们通常用的httpwebrequest类就采集不到了,这个时候我们通常会采用webbrowser来辅助采集,但是.net下自带的webbrow ...
- 简单的网页采集程序(ASP.NET MVC4)
因为懒人太多,造成现在网页数据采集非常的流行,我也来写个简单的记录一下. 之前写了MVC的基本框架的搭建随笔,后面因为公司太忙,个人感情问题:(,导致不想写了,就写了两篇给删除了,现在就搁浅了, 本人 ...
- 史林枫:开源HtmlAgilityPack公共小类库封装 - 网页采集(爬虫)辅助解析利器【附源码+可视化工具推荐】
做开发的,可能都做过信息采集相关的程序,史林枫也经常做一些数据采集或某些网站的业务办理自动化操作软件. 获取目标网页的信息很简单,使用网络编程,利用HttpWebResponse.HttpWebReq ...
- C#网页采集数据的几种方式(WebClient、WebBrowser和HttpWebRequest/HttpWebResponse)
一.通过WebClient获取网页内容 这是一种很简单的获取方式,当然,其它的获取方法也很简单.在这里首先要说明的是,如果为了实际项目的效率考虑,需要考虑在函数中分配一个内存区域.大概写法如下 //M ...
- 网页采集(通过HtmlAgilityPack+XPath)
有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取.因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记. 1.下载HtmlAgilityP ...
随机推荐
- Mac配置Eclipse CDT的Debug出现的问题(转)
问题1:出现 Could not determine GDB version using command: gdb --version 原因: mac上没有安装gdb或者gdb位置配置有问题 解决 ...
- 006.Docker网络管理
一 docker网络模式 Docker使用Linux的Namespaces技术来进行资源隔离,如PID Namespace隔离进程,Mount Namespace隔离文件系统,Network Name ...
- NDK官方下载链接
注:本文转载于成江海:<Android各个版本的NDK官方下载链接 > NDK官方网站:https://developer.android.google.cn/ndk/downloads/ ...
- Python3 Srcapy 爬虫
最近一直在理论学习,没有时间写博客.今天来一波Python爬虫,为机器学习做数据准备. 爬虫配置环境 Anaconda3 + Spyder + Scrapy Anaconda 安装就不绍了,网上很多. ...
- spring整合ssmXML版
以下是一个简单的ssm项目:如果中途报错,肯定是tomcat配置或者数据库配置有问题,在程序中注意将包名等配置换成自己的.数据库表需要提前建好,并加入数据,注意表结构要和实体对象对应. 1.开发条件: ...
- c#获取程序版本号
Content.Text = "程序集版本:" + System.Reflection.Assembly.GetExecutingAssembly().GetName().Vers ...
- 添加js,css 版本号?v= hash
node_modules设置 a.打开 node_modules\gulp-rev\index.js 第144行 manifest[originalFile] = revisionedFile; 更新 ...
- 深入学习webpack
webpack配置是标准的Node.js CommonJS模块,webpack中的绝对路径指"/src/css/file",相对路径指"../css/file" ...
- 构造函数与getter和setter的区别
构造函数是用于初始化类的属性,且只有在创建对象时才会调用构造函数,用于给对象分配地址 无参的构造函数,创建对象时默认调用,当程序没有明确写出有参的构造函数,系统会默认的创建一个. 有参的构造函数,创建 ...
- 修正 Mui 下拉上拉刷新功能
下拉增加动态时间计算功能: 上拉增加状态文字提示功能(当然也支持时间计算功能,只是我们暂时用不到):