Programming a Spider in Java 源码帖

Programming a Spider in Java 源码帖

Listing 1: Finding the bad links (CheckLinks.java)

import java.awt.*;

import javax.swing.*;

import java.net.*;

import java.io.*;

/**

* This example uses a Java spider to scan a Web site

* and check for broken links. Written by Jeff Heaton.

* Jeff Heaton is the author of "Programming Spiders,

* Bots, and Aggregators" by Sybex. Jeff can be contacted

* through his Web site at http://www.jeffheaton.com.

*

* @author Jeff Heaton(http://www.jeffheaton.com)

* @version 1.0

*/

public class CheckLinks extends javax.swing.JFrame implements Runnable,ISpiderReportable {

   /**
    * The constructor. Lays out the controls with absolute
    * coordinates (null layout) and registers the listener for
    * the begin/cancel button. The frame is left hidden; the
    * caller decides when to show it.
    */
   public CheckLinks() {
     //{{INIT_CONTROLS
     setTitle("Find Broken Links");
     getContentPane().setLayout(null);
     setSize(405,288);
     setVisible(false);
     label1.setText("Enter a URL:");
     getContentPane().add(label1);
     label1.setBounds(12,12,84,12);
     begin.setText("Begin");
     begin.setActionCommand("Begin");
     getContentPane().add(begin);
     begin.setBounds(12,36,84,24);
     getContentPane().add(url);
     url.setBounds(108,36,288,24);
     errorScroll.setAutoscrolls(true);
     errorScroll.setHorizontalScrollBarPolicy(javax.swing.ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
     errorScroll.setVerticalScrollBarPolicy(javax.swing.ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
     errorScroll.setOpaque(true);
     getContentPane().add(errorScroll);
     errorScroll.setBounds(12,120,384,156);
     errors.setEditable(false);
     errorScroll.getViewport().add(errors);
     errors.setBounds(0,0,366,138);
     current.setText("Currently Processing: ");
     getContentPane().add(current);
     current.setBounds(12,72,384,12);
     goodLinksLabel.setText("Good Links: 0");
     getContentPane().add(goodLinksLabel);
     goodLinksLabel.setBounds(12,96,192,12);
     badLinksLabel.setText("Bad Links: 0");
     getContentPane().add(badLinksLabel);
     badLinksLabel.setBounds(216,96,96,12);
     //}}

     //{{INIT_MENUS
     //}}

     //{{REGISTER_LISTENERS
     SymAction lSymAction = new SymAction();
     begin.addActionListener(lSymAction);
     //}}
   }

   /**
    * Main method for the application. The frame is created and
    * shown on the event dispatch thread, and closing the window
    * ends the program (a JFrame only hides itself by default,
    * which would leave the JVM running).
    *
    * @param args Not used
    */
   static public void main(String args[]){
     SwingUtilities.invokeLater(new Runnable(){
       public void run(){
         CheckLinks frame = new CheckLinks();
         frame.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE);
         frame.setVisible(true);
       }
     });
   }

   /**
    * Add notifications. The first time the native peer is
    * created, the frame is enlarged by its insets and menu bar
    * height so that the content area keeps the size requested
    * in the constructor.
    */
   public void addNotify(){
     // Record the size of the window prior to calling parent's
     // addNotify.
     Dimension size = getSize();
     super.addNotify();

     // Only adjust once, even if addNotify is called again.
     if ( frameSizeAdjusted )
       return;
     frameSizeAdjusted = true;

     // Adjust size of frame according to the insets and menu bar
     Insets insets = getInsets();
     javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
     int menuBarHeight = 0;
     if ( menuBar != null )
       menuBarHeight = menuBar.getPreferredSize().height;
     setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight);
   }

   // Used by addNotify
   boolean frameSizeAdjusted = false;

   //{{DECLARE_CONTROLS
   javax.swing.JLabel label1 = new javax.swing.JLabel();

   /**
    * The begin or cancel button
    */
   javax.swing.JButton begin = new javax.swing.JButton();

   /**
    * The URL being processed
    */
   javax.swing.JTextField url = new javax.swing.JTextField();

   /**
    * Scroll the errors.
    */
   javax.swing.JScrollPane errorScroll = new javax.swing.JScrollPane();

   /**
    * A place to store the errors created
    */
   javax.swing.JTextArea errors = new javax.swing.JTextArea();

   javax.swing.JLabel current = new javax.swing.JLabel();
   javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
   javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
   //}}

   //{{DECLARE_MENUS
   //}}

   /**
    * The background spider thread. Non-null while a scan is in
    * progress. Only read and written on the event dispatch thread.
    */
   protected Thread backgroundThread;

   /**
    * The spider object being used. Assigned by the background
    * thread and read by the event dispatch thread when the user
    * cancels, hence volatile.
    */
   protected volatile Spider spider;

   /**
    * The URL that the spider began with
    */
   protected URL base;

   /**
    * How many bad links have been found
    */
   protected int badLinksCount = 0;

   /**
    * How many good links have been found
    */
   protected int goodLinksCount = 0;

   /**
    * Internal class used to dispatch events
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   class SymAction implements java.awt.event.ActionListener {
     public void actionPerformed(java.awt.event.ActionEvent event){
       Object object = event.getSource();
       if ( object == begin )
         begin_actionPerformed(event);
     }
   }

   /**
    * Called when the begin or cancel buttons are clicked. When
    * no scan is running, the counters and their labels are reset
    * and a background thread is started; otherwise the running
    * spider is asked to cancel.
    *
    * @param event The event associated with the button.
    */
   void begin_actionPerformed(java.awt.event.ActionEvent event){
     if ( backgroundThread==null ) {
       // Reset the state BEFORE the thread starts so the new scan
       // can never have its first results wiped out.
       goodLinksCount=0;
       badLinksCount=0;
       goodLinksLabel.setText("Good Links: 0");
       badLinksLabel.setText("Bad Links: 0");
       begin.setText("Cancel");
       backgroundThread = new Thread(this);
       backgroundThread.start();
     } else {
       // The thread may not have created its spider yet.
       Spider running = spider;
       if ( running!=null ) {
         running.cancel();
       }
     }
   }

   /**
    * Perform the background thread operation: create a spider,
    * seed it with the address typed by the user and let it run
    * until it finishes or is cancelled. However the scan ends
    * (normally, bad address, or unexpected failure) the button
    * is switched back to "Begin" so that a new scan can be started.
    */
   public void run(){
     try {
       // Clear old errors on the event dispatch thread; the event
       // queue keeps this ahead of any error reported later.
       SwingUtilities.invokeLater(new Runnable(){
         public void run(){
           errors.setText("");
         }
       });
       spider = new Spider(this);
       spider.clear();
       base = new URL(url.getText());
       spider.addURL(base);
       spider.begin();
     } catch ( MalformedURLException e ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = "Bad address.";
       SwingUtilities.invokeLater(err);
     } finally {
       // Always return the UI to its idle state, on the event
       // dispatch thread where backgroundThread is owned.
       SwingUtilities.invokeLater(new Runnable(){
         public void run(){
           begin.setText("Begin");
           backgroundThread = null;
         }
       });
     }
   }

   /**
    * Called by the spider when a URL is found. It is here
    * that links are validated and counted.
    *
    * @param base The page that the link was found on.
    * @param url The actual link address.
    * @return True if the spider should also scan the linked
    * page, which is the case only for good links on the same
    * host as the page they were found on.
    */
   public boolean spiderFoundURL(URL base,URL url){
     // Show which link is being checked while the check runs.
     UpdateCurrentStats cs = new UpdateCurrentStats();
     cs.msg = url.toString();
     SwingUtilities.invokeLater(cs);

     boolean good = checkLink(url);
     if ( good )
       goodLinksCount++;
     else
       badLinksCount++;

     // Publish the counters again now that this link is included;
     // otherwise the labels would always lag one link behind.
     UpdateCurrentStats counted = new UpdateCurrentStats();
     counted.msg = url.toString();
     SwingUtilities.invokeLater(counted);

     if ( !good ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = url+"(on page " + base + ")\n";
       SwingUtilities.invokeLater(err);
       return false;
     }

     return url.getHost().equalsIgnoreCase(base.getHost());
   }

   /**
    * Called when a URL error is found. Not used by this
    * application.
    *
    * @param url The URL that resulted in an error.
    */
   public void spiderURLError(URL url){
   }

   /**
    * Called internally to check whether a link is good. For HTTP
    * and HTTPS links the response status is examined, because
    * merely connecting succeeds even when the server answers
    * with an error such as 404. Other protocols are considered
    * good when a connection can be established.
    *
    * @param url The link that is being checked.
    * @return True if the link was good, false otherwise.
    */
   protected boolean checkLink(URL url){
     URLConnection connection = null;
     try {
       connection = url.openConnection();
       if ( connection instanceof HttpURLConnection ) {
         // getResponseCode() returns -1 for a non-HTTP reply.
         int status = ((HttpURLConnection)connection).getResponseCode();
         return (status>=HttpURLConnection.HTTP_OK) && (status<HttpURLConnection.HTTP_BAD_REQUEST);
       }
       connection.connect();
       return true;
     } catch ( IOException e ) {
       return false;
     } finally {
       // The response body is never read, so release the connection.
       if ( connection instanceof HttpURLConnection ) {
         ((HttpURLConnection)connection).disconnect();
       }
     }
   }

   /**
    * Called when the spider finds an e-mail address. Not used
    * by this application.
    *
    * @param email The email address the spider found.
    */
   public void spiderFoundEMail(String email){
   }

   /**
    * Internal class used to update the error information
    * in a Thread-Safe way. Hand an instance to
    * SwingUtilities.invokeLater to append msg to the error list.
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateErrors implements Runnable {
     public String msg;
     public void run(){
       errors.append(msg);
     }
   }

   /**
    * Used to update the current status information
    * in a "Thread-Safe" way. The link counters are copied when
    * the object is created, so the labels show the values as
    * they were on the creating thread at that moment.
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateCurrentStats implements Runnable {
     public String msg;

     /** Good link count captured at construction time. */
     public int goodLinks = goodLinksCount;

     /** Bad link count captured at construction time. */
     public int badLinks = badLinksCount;

     public void run(){
       current.setText("Currently Processing: " + msg );
       goodLinksLabel.setText("Good Links: " + goodLinks);
       badLinksLabel.setText("Bad Links: " + badLinks);
     }
   }
}

Listing 2: Reporting spider events (ISpiderReportable.java)

import java.net.*;

interface ISpiderReportable {

   public boolean spiderFoundURL(URL base,URL url);

   public void spiderURLError(URL url);

   public void spiderFoundEMail(String email);

}

Listing 3: A reusable spider (Spider.java)

import java.util.*;

import java.net.*;

import java.io.*;

import javax.swing.text.*;

import javax.swing.text.html.*;

/**

* This class implements a reusable spider

*

* @author Jeff Heaton(http://www.jeffheaton.com)

* @version 1.0

*/

public class Spider {

   /**
    * A collection of URLs that resulted in an error
    */
   protected Collection workloadError = new ArrayList(3);

   /**
    * A collection of URLs that are waiting to be processed
    */
   protected Collection workloadWaiting = new ArrayList(3);

   /**
    * A collection of URLs that were processed
    */
   protected Collection workloadProcessed = new ArrayList(3);

   /**
    * The class that the spider should report its URLs to
    */
   protected ISpiderReportable report;

   /**
    * A flag that indicates whether this process
    * should be canceled. Volatile because cancel() is meant to
    * be called from a different thread than the one that is
    * executing begin().
    */
   protected volatile boolean cancel = false;

   /**
    * The constructor
    *
    * @param report A class that implements the ISpiderReportable
    * interface, that will receive information that the
    * spider finds.
    */
   public Spider(ISpiderReportable report){
     this.report = report;
   }

   /**
    * Get the URLs that resulted in an error.
    *
    * @return A collection of URL's.
    */
   public Collection getWorkloadError(){
     return workloadError;
   }

   /**
    * Get the URLs that were waiting to be processed.
    * You should add one URL to this collection to
    * begin the spider.
    *
    * @return A collection of URLs.
    */
   public Collection getWorkloadWaiting(){
     return workloadWaiting;
   }

   /**
    * Get the URLs that were processed by this spider.
    *
    * @return A collection of URLs.
    */
   public Collection getWorkloadProcessed(){
     return workloadProcessed;
   }

   /**
    * Clear all of the workloads.
    */
   public void clear(){
     getWorkloadError().clear();
     getWorkloadWaiting().clear();
     getWorkloadProcessed().clear();
   }

   /**
    * Set a flag that will cause the begin
    * method to return before it is done. The page that is
    * currently being processed is still finished.
    */
   public void cancel(){
     cancel = true;
   }

   /**
    * Add a URL for processing. URLs that are already waiting,
    * already processed or already known to be in error are
    * ignored.
    *
    * @param url The URL to add to the waiting workload.
    */
   public void addURL(URL url){
     if ( getWorkloadWaiting().contains(url) )
       return;
     if ( getWorkloadError().contains(url) )
       return;
     if ( getWorkloadProcessed().contains(url) )
       return;
     log("Adding to workload: " + url );
     getWorkloadWaiting().add(url);
   }

   /**
    * Called internally to process a URL. The URL is downloaded
    * and, if its content type is textual (or unknown), parsed
    * for links. Afterwards the URL is moved from the waiting
    * workload to the processed workload, or to the error
    * workload if it could not be read.
    *
    * @param url The URL to be processed.
    */
   public void processURL(URL url){
     InputStream is = null;
     try {
       log("Processing: " + url );

       // get the URL's contents
       URLConnection connection = url.openConnection();
       String contentType = connection.getContentType();
       if ( (contentType!=null) && !contentType.toLowerCase().startsWith("text/") ) {
         getWorkloadWaiting().remove(url);
         getWorkloadProcessed().add(url);
         log("Not processing because content type is: " + contentType );
         // The body is not going to be read, so release the
         // connection that asking for the content type opened.
         if ( connection instanceof HttpURLConnection ) {
           ((HttpURLConnection)connection).disconnect();
         }
         return;
       }

       // read the URL
       is = connection.getInputStream();
       Reader r = new InputStreamReader(is);

       // parse the URL
       HTMLEditorKit.Parser parse = new HTMLParse().getParser();
       parse.parse(r,new Parser(url),true);
     } catch ( IOException e ) {
       getWorkloadWaiting().remove(url);
       getWorkloadError().add(url);
       log("Error: " + url );
       report.spiderURLError(url);
       return;
     } finally {
       // Always release the stream, whether parsing succeeded or not.
       if ( is!=null ) {
         try {
           is.close();
         } catch ( IOException ignored ) {
           // The page has already been handled; a failure while
           // closing cannot change the outcome.
         }
       }
     }

     // mark URL as complete
     getWorkloadWaiting().remove(url);
     getWorkloadProcessed().add(url);
     log("Complete: " + url );
   }

   /**
    * Called to start the spider. Processes waiting URLs until
    * none are left or cancel() is called. Each pass works on a
    * snapshot of the waiting workload because processing a page
    * adds newly found URLs to it.
    */
   public void begin(){
     cancel = false;
     while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
       Object list[] = getWorkloadWaiting().toArray();
       for ( int i=0;(i<list.length)&&!cancel;i++ )
         processURL((URL)list[i]);
     }
   }

   /**
    * A HTML parser callback used by this class to detect links
    *
    * @author Jeff Heaton
    * @version 1.0
    */
   protected class Parser
   extends HTMLEditorKit.ParserCallback {

     /**
      * The page being parsed; relative links are resolved
      * against it.
      */
     protected URL base;

     public Parser(URL base){
       this.base = base;
     }

     /**
      * Extracts the link target of a tag, if it has one: the
      * HREF attribute, or the SRC attribute of a FRAME tag. Any
      * fragment ("#...") is stripped. E-mail links are reported
      * through spiderFoundEMail; everything else goes to
      * handleLink.
      */
     public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos){
       String href = (String)a.getAttribute(HTML.Attribute.HREF);
       if( (href==null) && (t==HTML.Tag.FRAME) )
         href = (String)a.getAttribute(HTML.Attribute.SRC);
       if ( href==null )
         return;

       int i = href.indexOf('#');
       if ( i!=-1 )
         href = href.substring(0,i);

       // Match the complete "mailto:" scheme, ignoring case, so
       // that relative links which merely start with "mailt"
       // (e.g. "mailtest.html") are still treated as links.
       if ( href.regionMatches(true,0,"mailto:",0,7) ) {
         report.spiderFoundEMail(href);
         return;
       }

       handleLink(base,href);
     }

     public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos){
       handleSimpleTag(t,a,pos);     // handle the same way
     }

     /**
      * Resolves a link against the page it was found on, reports
      * it, and queues it if the report object asks for that.
      *
      * @param base The page that the link was found on.
      * @param str The link as written in the page.
      */
     protected void handleLink(URL base,String str){
       try {
         URL url = new URL(base,str);
         if ( report.spiderFoundURL(base,url) )
           addURL(url);
       } catch ( MalformedURLException e ) {
         log("Found malformed URL: " + str );
       }
     }
   }

   /**
    * Called internally to log information
    * This basic method just writes the log
    * out to the stdout.
    *
    * @param entry The information to be written to the log.
    */
   public void log(String entry){
     System.out.println( (new Date()) + ":" + entry );
   }
}

Listing 4: Parsing HTML (HTMLParse.java)

import javax.swing.text.html.*;

/**
 * Makes the HTML parser of HTMLEditorKit available to other
 * classes by widening the access of getParser to public.
 */
public class HTMLParse extends HTMLEditorKit {

   /**
    * Returns the parser supplied by HTMLEditorKit.
    *
    * @return The parser obtained from the superclass.
    */
   public HTMLEditorKit.Parser getParser(){
     HTMLEditorKit.Parser inherited = super.getParser();
     return inherited;
   }
}

Programming a Spider in Java 源码帖的更多相关文章

如何阅读Java源码阅读java的真实体会
刚才在论坛不经意间,看到有关源码阅读的帖子.回想自己前几年,阅读源码那种兴奋和成就感(1),不禁又有一种激动. 源码阅读,我觉得最核心有三点:技术基础+强烈的求知欲+耐心. 说到技术基础,我打个比 ...
Android反编译(一)之反编译JAVA源码
Android反编译(一) 之反编译JAVA源码 [目录] 1.工具 2.反编译步骤 3.实例 4.装X技巧 1.工具 1).dex反编译JAR工具 dex2jar http://code.go ...
如何阅读Java源码
刚才在论坛不经意间,看到有关源码阅读的帖子.回想自己前几年,阅读源码那种兴奋和成就感(1),不禁又有一种激动.源码阅读,我觉得最核心有三点:技术基础+强烈的求知欲+耐心. 说到技术基础,我打个比方吧, ...
Java 源码学习线路————_先JDK工具包集合_再core包，也就是String、StringBuffer等_Java IO类库
http://www.iteye.com/topic/1113732 原则网址 Java源码初接触如果你进行过一年左右的开发,喜欢用eclipse的debug功能.好了,你现在就有阅读源码的技术基础 ...
解密随机数生成器（二）——从java源码看线性同余算法
Random Java中的Random类生成的是伪随机数,使用的是48-bit的种子,然后调用一个linear congruential formula线性同余方程(Donald Knuth的编程艺术 ...
Java--Eclipse关联Java源码
打开Eclipse,Window->Preferences->Java 点Edit按钮后弹出: 点Source Attachment后弹出: 选择Java安装路径下的src.zip文件即可 ...
使用JDT.AST解析java源码
在做java源码的静态代码审计时,最基础的就是对java文件进行解析,从而获取到此java文件的相关信息: 在java文件中所存在的东西很多,很复杂,难以用相关的正则表达式去一一匹配.但是,eclip ...
[收藏] Java源码阅读的真实体会
收藏自http://www.iteye.com/topic/1113732 刚才在论坛不经意间,看到有关源码阅读的帖子.回想自己前几年,阅读源码那种兴奋和成就感(1),不禁又有一种激动. 源码阅读,我 ...
Java源码解读(一)——HashMap
HashMap作为常用的一种数据结构,阅读源码去了解其底层的实现是十分有必要的.在这里也分享自己阅读源码遇到的困难以及自己的思考. HashMap的源码介绍已经有许许多多的博客,这里只记录了一些我看源 ...

随机推荐

ASP.NET MVC轻教程 Step By Step 5——初识表单
上一节我们将留言列表显示在Index视图里了,现在该添加一个留言的表单,好让用户自己添加留言. 首先在HomeController中添加一个名为“Write”的动作方法. public ActionR ...
Socket 错误总结
错误因为并没有搞清楚accept函数的使用,所以导致不停的发送失败,同时还不知道错误在哪里,无意中看见errno这个库,可以记录错误的原因,才知道原因在于没有用客户端的套接字进行接收数据,而这个客户 ...
MFC 之ActiveX控件学习
本文将介绍ActiveX控件的应用与工作原理,读者可以把ActiveX控件看成一个极小服务器的应用程序,它不能独立运行,必须要嵌入到容器程序中与容器一起运行,就像电脑主机中的显卡,它自己在电脑硬件系统 ...
For Aisha（阿伊莎）
相见时难别亦难,东风无力百花残.by:昂思多,20160524 跟你在一起,没有拘束感,完全就像是在跟亲人对话. 很喜欢这种感觉虽然才认识不到10天,却就像是认识了好几年的老朋友真的喜欢叫你“阿伊 ...
【Java】Java Platform
The Java platform has two components: The Java Virtual Machine The Java Application Programming Inte ...
纯CSS实现delay连续动画
从前css3还没出来的时候,用jquery的delay方法可以串起一个一个独立的动画片段. 那么在不使用jquery的平台上,如何借助css3来完成一些列动作呢? 有高人做了一个动感十足的人物动画: ...
eclipse设置字体大小
eclipse是我们常用的开发工具.eclipse中的默认字体往往并不满足我们的需要,我经常要调节一下它的大小或者换一下风格.eclipse中的字体大小怎么改变呢? 工具/原料 eclipse 方法/ ...
C语言结构体的对齐原则
Q:关于结构体的对齐,到底遵循什么原则?A:首先先不讨论结构体按多少字节对齐,先看看只以1字节对齐的情况: #include <stdio.h> #include <string.h ...
一起啃PRML - 1 Introduction 绪论
一起啃PRML - 1 Introduction @copyright 转载请注明出处 http://www.cnblogs.com/chxer/ 这一部分主要是介绍一下Pattern Recogni ...
WordPress wp-admin/includes/post.php脚本安全漏洞
漏洞名称: WordPress wp-admin/includes/post.php脚本安全漏洞 CNNVD编号: CNNVD-201309-168 发布时间: 2013-09-13 更新时间: 20 ...

Programming a Spider in Java 源码帖

Programming a Spider in Java 源码帖的更多相关文章

随机推荐

热门专题