[php]代码库

<?php
// +----------------------------------------------------------------------
// | ThinkPHP [ WE CAN DO IT JUST THINK IT ]
// +----------------------------------------------------------------------
// | Copyright (c) 2009 http://thinkphp.cn All rights reserved.
// +----------------------------------------------------------------------
// +----------------------------------------------------------------------
// | Author: liu21st <liu21st@gmail.com>
// +----------------------------------------------------------------------
 
/**
 * Http utility class.
 * Static helpers for fetching remote files (via cURL or a raw socket),
 * sending a file or a string to the browser as a download, inspecting the
 * request headers and emitting an HTTP status line.
 * @category   ORG
 * @package  ORG
 * @subpackage  Net
 * @author    liu21st <liu21st@gmail.com>
 */
class Http {

    /**
     * Fetch a remote file with cURL and save it locally.
     * The local file is opened before the cURL handle is created, so an
     * unwritable target fails fast without touching the network. On a failed
     * transfer the (possibly empty or partial) local file is left in place.
     * @static
     * @access public
     * @param string $remote remote file URL
     * @param string $local local file name to write to
     * @return boolean true when the transfer succeeded, false otherwise
     */
    static public function curlDownload($remote,$local) {
        $fp = fopen($local, 'wb');
        if($fp === false) {
            return false;
        }
        $cp = curl_init($remote);
        if($cp === false) {
            // Do not leak the file handle when cURL cannot be initialised.
            fclose($fp);
            return false;
        }
        curl_setopt($cp, CURLOPT_FILE, $fp);
        curl_setopt($cp, CURLOPT_HEADER, 0);
        $ok = curl_exec($cp);
        curl_close($cp);
        fclose($fp);
        return $ok !== false;
    }

   /**
    * Fetch a remote resource over plain HTTP/1.0 using fsockopen.
    * Useful when the cURL extension is not available.
    * Only the response body is returned; the response headers are skipped.
    * The URL scheme is ignored: the connection is always a plain TCP socket
    * and the port defaults to 80.
    * @static
    * @access public
    * @param string $url remote URL
    * @param array $conf extra options (unknown keys are ignored):
    *        int    limit   maximum number of body bytes to read, 0 = no limit
    *        mixed  post    POST body, either a "key=value&..." string or an array;
    *                       a non-empty value turns the request into a POST
    *        string cookie  value of the Cookie request header
    *        string ip      when given, connect to this address instead of the URL host
    *                       (the Host header still carries the URL host)
    *        int    timeout connect and read timeout in seconds
    *        bool   block   whether the socket is in blocking mode, default true
    * @return string response body, or '' when $conf is not an array or the connection fails
    */
    static public function fsockopenDownload($url, $conf = array()) {
        $return = '';
        if(!is_array($conf)) return $return;

        $matches = parse_url($url);
        if(!is_array($matches)) {
            // parse_url() returns false for seriously malformed URLs.
            $matches = array();
        }
        $host  = isset($matches['host'])  ? $matches['host']  : '';
        $query = isset($matches['query']) ? $matches['query'] : '';
        // Keep the query string even when the URL has no path ("http://host?a=1").
        $path  = !empty($matches['path']) ? $matches['path'] : '/';
        if($query !== '') {
            $path .= '?'.$query;
        }
        $port = !empty($matches['port']) ? $matches['port'] : 80;

        $defaults = array(
            'limit'     =>   0,
            'post'      =>   '',
            'cookie'    =>   '',
            'ip'        =>   '',
            'timeout'   =>   15,
            'block'     =>   TRUE,
            );
        // Read only the documented keys instead of extracting every caller
        // supplied key into the local scope (which could overwrite $host, $path...).
        $conf    = array_merge($defaults, array_intersect_key($conf, $defaults));
        $limit   = max(0, (int)$conf['limit']);
        $post    = $conf['post'];
        $cookie  = (string)$conf['cookie'];
        $ip      = $conf['ip'];
        $timeout = $conf['timeout'];
        $block   = $conf['block'];

        // HTTP_USER_AGENT is not set on the command line or for clients that omit it.
        $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

        $isPost = !empty($post);
        if($isPost && is_array($post)) {
            $post = http_build_query($post);
        }

        // Build the request head; header order follows the original implementation.
        $lines   = array();
        $lines[] = ($isPost ? 'POST' : 'GET')." $path HTTP/1.0";
        $lines[] = 'Accept: */*';
        $lines[] = 'Accept-Language: zh-cn';
        if($isPost) {
            $lines[] = 'Content-Type: application/x-www-form-urlencoded';
        }
        if($userAgent !== '') {
            $lines[] = 'User-Agent: '.$userAgent;
        }
        $lines[] = "Host: $host";
        if($isPost) {
            $lines[] = 'Content-Length: '.strlen($post);
        }
        $lines[] = 'Connection: Close';
        if($isPost) {
            $lines[] = 'Cache-Control: no-cache';
        }
        if($cookie !== '') {
            $lines[] = 'Cookie: '.$cookie;
        }
        $out = implode("\r\n", $lines)."\r\n\r\n";
        if($isPost) {
            $out .= $post;
        }

        $fp = @fsockopen(($ip ? $ip : $host), $port, $errno, $errstr, $timeout);
        if(!$fp) {
            return '';
        }

        stream_set_blocking($fp, $block);
        stream_set_timeout($fp, $timeout);
        @fwrite($fp, $out);
        $status = stream_get_meta_data($fp);
        if(!$status['timed_out']) {
            // Skip the response headers: read lines up to the first empty one.
            while(!feof($fp)) {
                $line = @fgets($fp);
                if($line === "\r\n" || $line === "\n") {
                    break;
                }
                if($line === false) {
                    // A read timeout does not set EOF; without this check the
                    // loop would spin until the script itself is killed.
                    $status = stream_get_meta_data($fp);
                    if($status['timed_out']) {
                        break;
                    }
                }
            }

            // Read the body in chunks of at most 8192 bytes, honouring $limit.
            while(!feof($fp)) {
                $data = fread($fp, ($limit == 0 || $limit > 8192 ? 8192 : $limit));
                if($data === false || $data === '') {
                    $status = stream_get_meta_data($fp);
                    if($data === false || $status['timed_out']) {
                        break;
                    }
                    // Non-blocking socket with nothing to read yet: try again.
                    continue;
                }
                $return .= $data;
                if($limit) {
                    $limit -= strlen($data);
                    if($limit <= 0) {
                        break;
                    }
                }
            }
        }
        @fclose($fp);
        return $return;
    }

    /**
     * Send a file, or a string, to the browser as a download and terminate the script.
     * $filename is looked up as given and then below UPLOAD_PATH (when that
     * constant is defined). When $content is non-empty it is what gets sent and
     * Content-Length is computed from it; $filename then only supplies the
     * default display name and the MIME type.
     * Calls throw_exception() when there is neither a file nor content.
     * @static
     * @access public
     * @param string $filename file to download
     * @param string $showname file name shown to the user (defaults to $filename)
     * @param string $content  content to send instead of the file
     * @param integer $expire  browser cache lifetime in seconds
     * @return void
     */
    static public function download ($filename, $showname='',$content='',$expire=180) {
        $isFile = is_file($filename);
        if(!$isFile && defined('UPLOAD_PATH') && is_file(UPLOAD_PATH.$filename)) {
            $filename = UPLOAD_PATH.$filename;
            $isFile   = true;
        }
        // The body sent below is $content whenever it is non-empty, so the
        // announced length has to be taken from it in that case.
        if($content != '') {
            $length = strlen($content);
        }elseif($isFile) {
            $length = filesize($filename);
        }else {
            throw_exception($filename.L('下载文件不存在!'));
        }
        if(empty($showname)) {
            $showname = $filename;
        }
        $showname = basename($showname);

        $type = '';
        if(!empty($filename)) {
            // The native function warns and returns false for a name that is
            // not a real file (content-only downloads); suppress that so no
            // output is produced before the headers.
            $type = @mime_content_type($filename);
        }
        if(empty($type)) {
            $type = "application/octet-stream";
        }

        // Quote the file name so that names containing spaces survive.
        $quotedName = str_replace(array('\\', '"'), array('\\\\', '\\"'), $showname);

        // Send the HTTP headers and start the download
        header("Pragma: public");
        header("Cache-control: max-age=".$expire);
        //header('Cache-Control: no-store, no-cache, must-revalidate');
        // HTTP dates need a space before the "GMT" suffix.
        header("Expires: " . gmdate("D, d M Y H:i:s",time()+$expire) . " GMT");
        header("Last-Modified: " . gmdate("D, d M Y H:i:s",time()) . " GMT");
        header('Content-Disposition: attachment; filename="'.$quotedName.'"');
        header("Content-Length: ".$length);
        header("Content-type: ".$type);
        header('Content-Encoding: none');
        header("Content-Transfer-Encoding: binary" );
        if($content == '' ) {
            readfile($filename);
        }else {
            echo($content);
        }
        exit();
    }

    /**
     * Show the HTTP request headers.
     * Uses getallheaders() when available and otherwise rebuilds the header
     * list from the HTTP_* entries of $_SERVER.
     * @static
     * @access public
     * @param string $header name of a single header to show; empty for all headers
     * @param boolean $echo true to print the result as HTML (escaped, with <br />
     *                      line breaks), false to return it as plain text
     * @return string|null "Name:value\n" lines when $echo is false, nothing otherwise
     */
    static public function getHeaderInfo($header='',$echo=true) {
        if(function_exists('getallheaders')) {
            $headers = getallheaders();
        }else {
            $headers = array();
            foreach($_SERVER as $key=>$val) {
                if(strpos($key, 'HTTP_') === 0) {
                    // HTTP_ACCEPT_LANGUAGE -> Accept-Language
                    $name = str_replace(' ', '-', ucwords(strtolower(str_replace('_', ' ', substr($key, 5)))));
                    $headers[$name] = $val;
                }
            }
        }
        if(!is_array($headers)) {
            $headers = array();
        }

        $output = '';
        if(!empty($header)) {
            $info    = isset($headers[$header]) ? $headers[$header] : '';
            $output .= $header.':'.$info."\n";
        }else {
            foreach($headers as $key=>$val) {
                $output .= "$key:$val\n";
            }
        }

        if ($echo) {
            // Header values come from the client: escape them before printing HTML.
            echo (nl2br(htmlspecialchars($output, ENT_QUOTES)));
        }else {
            return $output;
        }
    }

    /**
     * Send the HTTP/1.1 status line for a status code.
     * Codes that are not in the table below are silently ignored.
     * @static
     * @access public
     * @param int $code HTTP status code
     * @return void
     */
    static public function sendHttpStatus($code) {
        static $_status = array(
            // Informational 1xx
            100 => 'Continue',
            101 => 'Switching Protocols',

            // Success 2xx
            200 => 'OK',
            201 => 'Created',
            202 => 'Accepted',
            203 => 'Non-Authoritative Information',
            204 => 'No Content',
            205 => 'Reset Content',
            206 => 'Partial Content',

            // Redirection 3xx
            300 => 'Multiple Choices',
            301 => 'Moved Permanently',
            302 => 'Found',  // 1.1
            303 => 'See Other',
            304 => 'Not Modified',
            305 => 'Use Proxy',
            // 306 is deprecated but reserved
            307 => 'Temporary Redirect',

            // Client Error 4xx
            400 => 'Bad Request',
            401 => 'Unauthorized',
            402 => 'Payment Required',
            403 => 'Forbidden',
            404 => 'Not Found',
            405 => 'Method Not Allowed',
            406 => 'Not Acceptable',
            407 => 'Proxy Authentication Required',
            408 => 'Request Timeout',
            409 => 'Conflict',
            410 => 'Gone',
            411 => 'Length Required',
            412 => 'Precondition Failed',
            413 => 'Request Entity Too Large',
            414 => 'Request-URI Too Long',
            415 => 'Unsupported Media Type',
            416 => 'Requested Range Not Satisfiable',
            417 => 'Expectation Failed',

            // Server Error 5xx
            500 => 'Internal Server Error',
            501 => 'Not Implemented',
            502 => 'Bad Gateway',
            503 => 'Service Unavailable',
            504 => 'Gateway Timeout',
            505 => 'HTTP Version Not Supported',
            509 => 'Bandwidth Limit Exceeded'
        );
        if(isset($_status[$code])) {
            header('HTTP/1.1 '.$code.' '.$_status[$code]);
        }
    }
}// end of class Http
if (!function_exists('mime_content_type')) {
    /**
     * Fallback for mime_content_type(), defined only when PHP does not provide it.
     * The type is looked up purely from the file name: the text after the last
     * dot, lower-cased, is matched against a fixed table. The file itself is
     * never opened.
     * @param string $filename file name or path
     * @return string MIME type; 'application/octet-stream' when the extension is unknown or missing
     */
    function mime_content_type($filename) {
        static $mimeByExtension = array(
            'ai'        => 'application/postscript',
            'aif'       => 'audio/x-aiff',
            'aifc'      => 'audio/x-aiff',
            'aiff'      => 'audio/x-aiff',
            'asc'       => 'application/pgp', //changed by skwashd - was text/plain
            'asf'       => 'video/x-ms-asf',
            'asx'       => 'video/x-ms-asf',
            'au'        => 'audio/basic',
            'avi'       => 'video/x-msvideo',
            'bcpio'     => 'application/x-bcpio',
            'bin'       => 'application/octet-stream',
            'bmp'       => 'image/bmp',
            'c'         => 'text/plain', // or 'text/x-csrc', //added by skwashd
            'cc'        => 'text/plain', // or 'text/x-c++src', //added by skwashd
            'cs'        => 'text/plain', //added by skwashd - for C# src
            'cpp'       => 'text/x-c++src', //added by skwashd
            'cxx'       => 'text/x-c++src', //added by skwashd
            'cdf'       => 'application/x-netcdf',
            'class'     => 'application/octet-stream',//secure but application/java-class is correct
            'com'       => 'application/octet-stream',//added by skwashd
            'cpio'      => 'application/x-cpio',
            'cpt'       => 'application/mac-compactpro',
            'csh'       => 'application/x-csh',
            'css'       => 'text/css',
            'csv'       => 'text/comma-separated-values',//added by skwashd
            'dcr'       => 'application/x-director',
            'diff'      => 'text/diff',
            'dir'       => 'application/x-director',
            'dll'       => 'application/octet-stream',
            'dms'       => 'application/octet-stream',
            'doc'       => 'application/msword',
            'dot'       => 'application/msword',//added by skwashd
            'dvi'       => 'application/x-dvi',
            'dxr'       => 'application/x-director',
            'eps'       => 'application/postscript',
            'etx'       => 'text/x-setext',
            'exe'       => 'application/octet-stream',
            'ez'        => 'application/andrew-inset',
            'gif'       => 'image/gif',
            'gtar'      => 'application/x-gtar',
            'gz'        => 'application/x-gzip',
            'h'         => 'text/plain', // or 'text/x-chdr',//added by skwashd
            'h++'       => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hh'        => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hpp'       => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hxx'       => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hdf'       => 'application/x-hdf',
            'hqx'       => 'application/mac-binhex40',
            'htm'       => 'text/html',
            'html'      => 'text/html',
            'ice'       => 'x-conference/x-cooltalk',
            'ics'       => 'text/calendar',
            'ief'       => 'image/ief',
            'ifb'       => 'text/calendar',
            'iges'      => 'model/iges',
            'igs'       => 'model/iges',
            'jar'       => 'application/x-jar', //added by skwashd - alternative mime type
            'java'      => 'text/x-java-source', //added by skwashd
            'jpe'       => 'image/jpeg',
            'jpeg'      => 'image/jpeg',
            'jpg'       => 'image/jpeg',
            'js'        => 'application/x-javascript',
            'kar'       => 'audio/midi',
            'latex'     => 'application/x-latex',
            'lha'       => 'application/octet-stream',
            'log'       => 'text/plain',
            'lzh'       => 'application/octet-stream',
            'm3u'       => 'audio/x-mpegurl',
            'man'       => 'application/x-troff-man',
            'me'        => 'application/x-troff-me',
            'mesh'      => 'model/mesh',
            'mid'       => 'audio/midi',
            'midi'      => 'audio/midi',
            'mif'       => 'application/vnd.mif',
            'mov'       => 'video/quicktime',
            'movie'     => 'video/x-sgi-movie',
            'mp2'       => 'audio/mpeg',
            'mp3'       => 'audio/mpeg',
            'mpe'       => 'video/mpeg',
            'mpeg'      => 'video/mpeg',
            'mpg'       => 'video/mpeg',
            'mpga'      => 'audio/mpeg',
            'ms'        => 'application/x-troff-ms',
            'msh'       => 'model/mesh',
            'mxu'       => 'video/vnd.mpegurl',
            'nc'        => 'application/x-netcdf',
            'oda'       => 'application/oda',
            'patch'     => 'text/diff',
            'pbm'       => 'image/x-portable-bitmap',
            'pdb'       => 'chemical/x-pdb',
            'pdf'       => 'application/pdf',
            'pgm'       => 'image/x-portable-graymap',
            'pgn'       => 'application/x-chess-pgn',
            'pgp'       => 'application/pgp',//added by skwashd
            'php'       => 'application/x-httpd-php',
            'php3'      => 'application/x-httpd-php3',
            'pl'        => 'application/x-perl',
            'pm'        => 'application/x-perl',
            'png'       => 'image/png',
            'pnm'       => 'image/x-portable-anymap',
            'po'        => 'text/plain',
            'ppm'       => 'image/x-portable-pixmap',
            'ppt'       => 'application/vnd.ms-powerpoint',
            'ps'        => 'application/postscript',
            'qt'        => 'video/quicktime',
            'ra'        => 'audio/x-realaudio',
            'rar'       => 'application/octet-stream',
            'ram'       => 'audio/x-pn-realaudio',
            'ras'       => 'image/x-cmu-raster',
            'rgb'       => 'image/x-rgb',
            'rm'        => 'audio/x-pn-realaudio',
            'roff'      => 'application/x-troff',
            'rpm'       => 'audio/x-pn-realaudio-plugin',
            'rtf'       => 'text/rtf',
            'rtx'       => 'text/richtext',
            'sgm'       => 'text/sgml',
            'sgml'      => 'text/sgml',
            'sh'        => 'application/x-sh',
            'shar'      => 'application/x-shar',
            'shtml'     => 'text/html',
            'silo'      => 'model/mesh',
            'sit'       => 'application/x-stuffit',
            'skd'       => 'application/x-koan',
            'skm'       => 'application/x-koan',
            'skp'       => 'application/x-koan',
            'skt'       => 'application/x-koan',
            'smi'       => 'application/smil',
            'smil'      => 'application/smil',
            'snd'       => 'audio/basic',
            'so'        => 'application/octet-stream',
            'spl'       => 'application/x-futuresplash',
            'src'       => 'application/x-wais-source',
            'stc'       => 'application/vnd.sun.xml.calc.template',
            'std'       => 'application/vnd.sun.xml.draw.template',
            'sti'       => 'application/vnd.sun.xml.impress.template',
            'stw'       => 'application/vnd.sun.xml.writer.template',
            'sv4cpio'   => 'application/x-sv4cpio',
            'sv4crc'    => 'application/x-sv4crc',
            'swf'       => 'application/x-shockwave-flash',
            'sxc'       => 'application/vnd.sun.xml.calc',
            'sxd'       => 'application/vnd.sun.xml.draw',
            'sxg'       => 'application/vnd.sun.xml.writer.global',
            'sxi'       => 'application/vnd.sun.xml.impress',
            'sxm'       => 'application/vnd.sun.xml.math',
            'sxw'       => 'application/vnd.sun.xml.writer',
            't'         => 'application/x-troff',
            'tar'       => 'application/x-tar',
            'tcl'       => 'application/x-tcl',
            'tex'       => 'application/x-tex',
            'texi'      => 'application/x-texinfo',
            'texinfo'   => 'application/x-texinfo',
            'tgz'       => 'application/x-gtar',
            'tif'       => 'image/tiff',
            'tiff'      => 'image/tiff',
            'tr'        => 'application/x-troff',
            'tsv'       => 'text/tab-separated-values',
            'txt'       => 'text/plain',
            'ustar'     => 'application/x-ustar',
            'vbs'       => 'text/plain', //added by skwashd - for obvious reasons
            'vcd'       => 'application/x-cdlink',
            'vcf'       => 'text/x-vcard',
            'vcs'       => 'text/calendar',
            'vfb'       => 'text/calendar',
            'vrml'      => 'model/vrml',
            'vsd'       => 'application/vnd.visio',
            'wav'       => 'audio/x-wav',
            'wax'       => 'audio/x-ms-wax',
            'wbmp'      => 'image/vnd.wap.wbmp',
            'wbxml'     => 'application/vnd.wap.wbxml',
            'wm'        => 'video/x-ms-wm',
            'wma'       => 'audio/x-ms-wma',
            'wmd'       => 'application/x-ms-wmd',
            'wml'       => 'text/vnd.wap.wml',
            'wmlc'      => 'application/vnd.wap.wmlc',
            'wmls'      => 'text/vnd.wap.wmlscript',
            'wmlsc'     => 'application/vnd.wap.wmlscriptc',
            'wmv'       => 'video/x-ms-wmv',
            'wmx'       => 'video/x-ms-wmx',
            'wmz'       => 'application/x-ms-wmz',
            'wrl'       => 'model/vrml',
            'wvx'       => 'video/x-ms-wvx',
            'xbm'       => 'image/x-xbitmap',
            'xht'       => 'application/xhtml+xml',
            'xhtml'     => 'application/xhtml+xml',
            'xls'       => 'application/vnd.ms-excel',
            'xlt'       => 'application/vnd.ms-excel',
            'xml'       => 'application/xml',
            'xpm'       => 'image/x-xpixmap',
            'xsl'       => 'text/xml',
            'xwd'       => 'image/x-xwindowdump',
            'xyz'       => 'chemical/x-xyz',
            'z'         => 'application/x-compress',
            'zip'       => 'application/zip',
        );

        // ".ext" tail of the name (false when there is no dot at all),
        // then the extension without its leading dot, lower-cased.
        $dotSuffix = strrchr($filename, '.');
        $extension = strtolower(substr($dotSuffix, 1));

        return isset($mimeByExtension[$extension])
            ? $mimeByExtension[$extension]
            : 'application/octet-stream';
    }
}
 
if(!function_exists('image_type_to_extension')){
   /**
    * Fallback for image_type_to_extension(), defined only when PHP does not provide it.
    * Maps an IMAGETYPE_* constant to a file extension.
    * Note: JPEG maps to "jpg" and JPX to "jpf" here, as this fallback always did.
    * @param int $imagetype one of the IMAGETYPE_* constants
    * @param boolean $include_dot whether to prepend a dot to the extension
    *                             (same optional parameter as the built-in function)
    * @return string|false the extension, or false for an empty or unknown type
    */
   function image_type_to_extension($imagetype, $include_dot = true) {
       if(empty($imagetype)) return false;
       switch($imagetype) {
           case IMAGETYPE_GIF       : $ext = 'gif';  break;
           case IMAGETYPE_JPEG      : $ext = 'jpg';  break;
           case IMAGETYPE_PNG       : $ext = 'png';  break;
           case IMAGETYPE_SWF       : $ext = 'swf';  break;
           case IMAGETYPE_PSD       : $ext = 'psd';  break;
           case IMAGETYPE_BMP       : $ext = 'bmp';  break;
           case IMAGETYPE_TIFF_II   : $ext = 'tiff'; break;
           case IMAGETYPE_TIFF_MM   : $ext = 'tiff'; break;
           case IMAGETYPE_JPC       : $ext = 'jpc';  break;
           case IMAGETYPE_JP2       : $ext = 'jp2';  break;
           case IMAGETYPE_JPX       : $ext = 'jpf';  break;
           case IMAGETYPE_JB2       : $ext = 'jb2';  break;
           case IMAGETYPE_SWC       : $ext = 'swc';  break;
           // IFF images use ".iff"; ".aiff" is an audio format.
           case IMAGETYPE_IFF       : $ext = 'iff';  break;
           case IMAGETYPE_WBMP      : $ext = 'wbmp'; break;
           case IMAGETYPE_XBM       : $ext = 'xbm';  break;
           default                  : return false;
       }
       return $include_dot ? '.'.$ext : $ext;
   }
}

用 jQuery 风格的选择器筛选采集内容:相信很多大牛都知道这个类库,可自学出身的我还是找了很久,phpQuery、Snoopy 等一遍一遍尝试,最后才在无意中找到 phpSimpleHtmlDom,更让人惊喜的是又找到了中文手册。
一个人的学习,漫长而又艰辛,真希望有时候能得到指点,不至于让时间无辜的流失.

基础代码:获取网页建议用 cURL,附加 POST 数据可以在登录后采集。

  <?php
  // Fetch a page with cURL and list every link inside #leftcolumn.
  require_once('./simple_html_dom.php');

  $url = 'http://www.w3cschool.cc/';

  $curl = curl_init();
  curl_setopt($curl, CURLOPT_URL, $url);
  // Return the response as a string instead of printing it.
  curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
  // Keep the response headers out of the result: it is handed to the HTML parser.
  curl_setopt($curl, CURLOPT_HEADER, 0);
  $result = curl_exec($curl);
  curl_close($curl);

  // curl_exec() returns false on failure and str_get_html() returns false for
  // empty input, so check both before calling methods on the DOM object.
  $html = ($result !== false) ? str_get_html($result) : false;
  if ($html) {
      foreach ($html->find('#leftcolumn a') as $element) {
          echo $element->href . '<br>';      // link URL
          echo $element->plaintext . '<br>'; // link text
      }
      // Release the parser's memory.
      $html->clear();
      unset($html);
  }
复制代码

中文手册(作者: S.C. Chen):
http://www.ecartchina.com/php-simple-html-dom/index.htm

采集淘宝测试

  <?php
  // Demo: collect the categories of a Taobao shop, then the goods of each category.
  require_once('simple_html_dom.php');
  // "time_limit" is not a php.ini directive; set_time_limit(0) removes the execution time limit.
  set_time_limit(0);
  ini_set("memory_limit","512M");
  $memory = memory_get_usage();
  echo 'memory:'.($memory/1024).'KB<br/>';
  echo 'time:'.date('H:i:s',time()).'<br/>';

  /**
   * Fetch a URL with cURL.
   * @param string $url
   * @return string|false response body, or false when the request failed
   */
  function curl_get_content($url){
      $curl = curl_init();
      curl_setopt($curl, CURLOPT_URL, $url);
      curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
      curl_setopt($curl, CURLOPT_HEADER, 0);         // body only, no response headers
      $result = curl_exec($curl);
      curl_close($curl);
      return $result;
  }

  // Step 1: category links from the shop front page.
  $cateUrl  = 'http://the-seventh-sense.taobao.com/';
  $cateCon  = curl_get_content($cateUrl);
  $cateHtml = $cateCon ? str_get_html($cateCon) : false;
  $CateList = array();
  if ($cateHtml) {
      foreach ($cateHtml->find('.J_TAllCatsTree li .fst-cat-hd a[href*=category]') as $element) {
          $CateList[] = array(
              'url'  => urldecode($element->href),
              'name' => $element->plaintext,
          );
      }
      $cateHtml->clear();
      unset($cateHtml);
  }

  // Step 2: goods of every category. Start from an empty list so that
  // count() below is valid even when nothing could be collected.
  $goodsList = array();
  foreach ($CateList as $goodsUrl) {
      $goodsCon  = curl_get_content($goodsUrl['url']);
      $goodsHtml = $goodsCon ? str_get_html($goodsCon) : false;
      if (!$goodsHtml) {
          continue; // fetch or parse failed, skip this category
      }
      foreach ($goodsHtml->find('.shop-hesper-bd .item') as $goodsElement) {
          // find(selector, 0) yields null when nothing matches.
          $nameNode  = $goodsElement->find(".detail .item-name",0);
          $priceNode = $goodsElement->find(".detail .c-price",0);
          $imgNode   = $goodsElement->find(".photo a img",0);
          $goodsList[] = array(
              'name'     => $nameNode  ? $nameNode->plaintext  : '',
              'price'    => $priceNode ? $priceNode->plaintext : '',
              'img'      => $imgNode   ? $imgNode->src         : '',
              'catename' => $goodsUrl['name'],
          );
      }
      $goodsHtml->clear();
      unset($goodsHtml);
  }

  echo '<hr/>';
  $n1 = count($CateList);
  $n2 = count($goodsList);
  echo '采集'.$n1.'条栏目'.$n2.'个商品<br/>';
  $memory = memory_get_usage();
  echo 'memory:'.($memory/1024).'KB<br/>';
  echo 'time:'.date('H:i:s',time()).'<br/>';
复制代码

beginmemory:971.953125KB
begintime:05:30:19
overmemory:1352.890625KB
overtime:05:30:39
耗时20s,成功采集9个栏目127个商品

phpQuery是一个基于PHP的服务端开源项目,它可以让PHP开发人员轻松处理DOM文档内容,比如获取某新闻网站的头条信息。更有意思的是,它采用了jQuery的思想,你可以像使用jQuery一样处理页面内容,获取你想要的页面信息。

采集头条

先看一实例,现在我要采集新浪网国内新闻的头条,代码如下:

// Load the phpQuery core library.
include 'phpQuery/phpQuery.php'; 
// Read the target page so that pq() can query it.
phpQuery::newDocumentFile('http://news.sina.com.cn/china'); 
// Print the inner HTML of the first <h1> inside the element with class "blkTop".
echo pq(".blkTop h1:eq(0)")->html(); 

简单的三行代码,就可以获取头条内容。首先在程序中包含phpQuery.php核心程序,然后调用读取目标网页,最后输出对应标签下的内容。

pq()是一个功能强大的方法,跟jQuery的$()如出一辙,jQuery的选择器基本上都能使用在phpQuery上,只要把“.”变成“->”。如上例中,pq(".blkTop h1:eq(0)")抓取了页面class属性为blkTop的DIV元素,并找到该DIV内部的第一个h1标签,然后用html()方法获取h1标签里的内容(带html标签),也就是我们要获取的头条信息,如果使用text()方法,则只获取头条的文本内容。当然要使用好phpQuery,关键是要找对文档中对应内容的节点。

采集文章列表

下面再来看一个例子,获取helloweba.com网站的blog列表,请看代码:

// Load the phpQuery core library and read the blog index page.
include 'phpQuery/phpQuery.php'; 
phpQuery::newDocumentFile('http://www.helloweba.com/blog.html'); 
// Every element with class "blog_li" is one article entry.
$artlist = pq(".blog_li"); 
foreach($artlist as $li){ 
   // Print the title markup (the <h2> content) of the entry.
   // NOTE(review): the trailing "" adds nothing; a separator such as a line
   // break may have been lost when the article was extracted - confirm.
   echo pq($li)->find('h2')->html().""; 
}

通过循环列表中的DIV,找出文章标题并输出,就是这么简单。

解析XML文档

假设现在有一个这样的test.xml文档:

<?xml version="1.0" encoding="utf-8"?> 
<root> 
  <contact> 
     <name>张三</name> 
     <age>22</age> 
  </contact> 
  <contact> 
     <name>王五</name> 
     <age>18</age> 
  </contact> 
</root> 

现在我要获取名字为张三的联系人的年龄,代码如下:

// Load the phpQuery core library.
include 'phpQuery/phpQuery.php'; 
// Read the sample test.xml so that pq() can query it.
phpQuery::newDocumentFile('test.xml'); 
// Select the first <age> that is a direct child of a <contact>. The match is
// by position, not by name: in the sample file the first contact is 张三,
// so this prints 22.
echo pq('contact > age:eq(0)'); 

结果输出:22

像jQuery一样,精准查找文档节点,输出节点下的内容,解析一个XML文档就是这么简单。现在你不必为采集网站内容而使用那些头疼的正则算法、内容替换等繁琐的代码了,有了phpQuery,一切就变得轻松多了。

项目官网地址:http://code.google.com/p/phpquery/

ThinkPHP Http工具类(用于远程采集 远程下载) phpSimpleHtmlDom采集类库_Jquery筛选方式 使用phpQuery轻松采集网页内容http://www.thinkphp.cn/extend/541.html的更多相关文章

  1. phpQuery轻松采集网页内容

    原文地址:phpQuery轻松采集网页内容作者:陌上花开 phpQuery是一个基于PHP的服务端开源项目,它可以让PHP开发人员轻松处理DOM文档内容,比如获取某新闻网站的头条信息.更有意思的是,它 ...

  2. Thinkphp自定义工具类的使用!

    在使用Thinkphp做开发的时候,很多时候会用到一些自己写的类,为了方便管理,可以把这些类,单独放到一个文件里. 这就是自定义工具类: 首先在 Application 目录下新建 Component ...

  3. 使用phpQuery轻松采集网页内容

    phpQuery是一个基于PHP的服务端开源项目,它可以让PHP开发人员轻松处理DOM文档内容,比如获取某新闻网站的头条信息.更有意思的是,它采用了jQuery的思想,你可以像使用jQuery一样处理 ...

  4. 工具类: 用于模拟HTTP请求中GET/POST方式

    package com.jarvis.base.util; import java.io.BufferedReader; import java.io.IOException; import java ...

  5. Google的java工具类Guava

    前言 google开发java项目肯定也不想重复造轮子,所以肯定也有工具类,就是它了:Guava 我将举例几个实际的例子,发挥这个工具类好用的功能.更多的方法和功能,还有内部的实现可以直接参考http ...

  6. commons-lang3-3.2.jar中的常用工具类的使用

    这个包中的很多工具类可以简化我们的操作,在这里简单的研究其中的几个工具类的使用. 1.StringUtils工具类 可以判断是否是空串,是否为null,默认值设置等操作: /** * StringUt ...

  7. Java精选笔记_集合概述(Collection接口、Collections工具类、Arrays工具类)

    集合概述 集合有时又称为容器,简单地说,它是一个对象,能将具有相同性质的多个元素汇聚成一个整体.集合被用于存储.获取.操纵和传输聚合的数据. 使用集合的技巧 看到Array就是数组结构,有角标,查询速 ...

  8. SerializeUtil 序列化,反序列化工具类

    package cloud.app.prod.home.utils; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutp ...

  9. 动态代理模式_应用(Redis工具类)

    本次使用动态代理的初衷是学习Redis,使用Java操作Redis时用到Jedis的JedisPool,而后对Jedis的方法进一步封装完善成为一个工具类.因为直接使用Jedis对象时,为了保证性能, ...

随机推荐

  1. UITextField点击选中文字

    1.先创建UITextField - (void)viewDidLoad { [super viewDidLoad]; // Do any additional setup after loading ...

  2. 2018.11.13 N4010A 通信设置

    设置电脑之IP地址及Subnet mask.      IP address: 192.168.1.2      Subnet mask: 255.255.255.0, 其它选项为默认. 然后点击OK ...

  3. New Concept English three(21)

    27W 59 Boxing matches were very popular in England two hundred years ago. In those days, boxers foug ...

  4. CSS 清除浮动 clear 属性

    CSS 清除浮动 clear 属性用于设定元素哪一侧不允许有其他浮动元素(而并非取消元素的浮动). 可能的取值如下: 取值 说明 none 默认值,允许两侧都有浮动元素 left 左侧不允许有其他浮动 ...

  5. 关于$_SERVER['PHP_AUTH_USER']

    http://www.cnblogs.com/thinksasa/p/3421379.html PHP 的 HTTP 认证机制仅在 PHP 以 Apache 模块方式运行时才有效,因此该功能不适用于 ...

  6. Lua基础---一维数组与多维数组

    Lua语言中,数组和C还是有区别的,Lua的数组下标从1开始计数,而C语言的数组下标从0开始计数,我想这可能是设计Lua的人想要符合人的思维习惯而去这么设计的. 数组,也就是按相同类型,在内存中顺序排 ...

  7. chrome 中for-in 在遍历对象时的顺序问题

  8. POJ2987 Firing 【最大权闭合图】

    POJ2987 Firing Description You've finally got mad at "the world's most stupid" employees o ...

  9. ThinkPHP5 使用create 获取表单所有字段

    TP5没有 TP3的那个create创建表单字段,如果字段太多,写起来是非常麻烦 只需要在 框架里面 think/db/Query.php 里面加上函数 public function create( ...

  10. matplotlib ----- 清空图片

    关闭单个图: fig = plt.figure(0) # 新图 0 plt.savefig() # 保存 plt. close(0) # 关闭图 0   关闭所有图不用管 fig 号码 fig = p ...