转自Crawling the Web with Java By James Holmes

无需任何扩展包,可直接运行。

  1. import java.awt.*;
  2. import java.awt.event.*;
  3. import java.io.*;
  4. import java.net.*;
  5. import java.util.*;
  6. import java.util.regex.*;
  7. import javax.swing.*;
  8. import javax.swing.table.*;
  // The Search Web Crawler.
// A Swing application (adapted from "Crawling the Web with Java" by
// James Holmes) that crawls the web breadth-first from a start URL,
// honours robots.txt Disallow rules, and records every page that
// contains a search string in a table and in a matches log file.
//
// Fixes relative to the original listing:
//  - an empty "Max URLs" field now means "unlimited" (it used to be
//    passed as 0, which stopped the crawl before the first page
//    because crawl() breaks when crawledList.size() == maxUrls);
//  - the crawling flag, shared between the crawler thread and the
//    Event Dispatch Thread (Stop button), is volatile;
//  - robots.txt and page readers are closed (try-with-resources);
//  - an empty robots.txt "Disallow:" line no longer blocks the whole
//    site (the robots exclusion standard says it allows everything);
//  - failing to open the matches log file re-enables the controls;
//  - raw collections replaced with generics; deprecated Frame.show()
//    replaced with setVisible(true) on the Event Dispatch Thread.
//
// NOTE(review): as in the book's code, the crawler thread updates
// Swing components directly; strictly those calls belong on the EDT
// (SwingUtilities.invokeLater) — kept as-is to preserve structure.
public class SearchCrawler extends JFrame
{
    // Max URLs drop-down values.
    private static final String[] MAX_URLS = {"50", "100", "500", "1000"};

    // Cache of robots.txt disallow lists, keyed by lowercased host.
    private HashMap<String, ArrayList<String>> disallowListCache =
        new HashMap<String, ArrayList<String>>();

    // Search GUI controls.
    private JTextField startTextField;
    private JComboBox<String> maxComboBox;
    private JCheckBox limitCheckBox;
    private JTextField logTextField;
    private JTextField searchTextField;
    private JCheckBox caseCheckBox;
    private JButton searchButton;

    // Search stats GUI controls.
    private JLabel crawlingLabel2;
    private JLabel crawledLabel2;
    private JLabel toCrawlLabel2;
    private JProgressBar progressBar;
    private JLabel matchesLabel2;

    // Table listing search matches.
    private JTable table;

    // Whether a crawl is underway.  Written by the crawler thread and
    // by the Event Dispatch Thread (Stop button), hence volatile.
    private volatile boolean crawling;

    // Matches log file print writer (open only while crawling).
    private PrintWriter logFileWriter;

    // Build the frame: menu bar, search/stats panel, matches table.
    public SearchCrawler()
    {
        setTitle("Search Crawler");
        setSize(600, 600);

        // Exit the application when the window is closed.
        addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                actionExit();
            }
        });

        setJMenuBar(buildMenuBar());

        // Add panels to display.
        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(buildSearchPanel(), BorderLayout.NORTH);
        getContentPane().add(buildMatchesPanel(), BorderLayout.CENTER);
    }

    // Build the File menu (File > Exit).
    private JMenuBar buildMenuBar() {
        JMenuBar menuBar = new JMenuBar();
        JMenu fileMenu = new JMenu("File");
        fileMenu.setMnemonic(KeyEvent.VK_F);
        JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionExit();
            }
        });
        fileMenu.add(fileExitMenuItem);
        menuBar.add(fileMenu);
        return menuBar;
    }

    // Apply the given constraints and add the component to the panel.
    private static void place(JPanel panel, GridBagLayout layout,
                              Component component,
                              GridBagConstraints constraints) {
        layout.setConstraints(component, constraints);
        panel.add(component);
    }

    // Constraints for a right-aligned caption cell.
    private static GridBagConstraints captionConstraints(int bottomInset) {
        GridBagConstraints constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, bottomInset, 0);
        return constraints;
    }

    // Constraints for a value cell filling the rest of its row.
    private static GridBagConstraints valueConstraints(int bottomInset) {
        GridBagConstraints constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, bottomInset, 5);
        return constraints;
    }

    // A value label rendered in a plain (non-bold) font.
    private static JLabel plainLabel() {
        JLabel label = new JLabel();
        label.setFont(label.getFont().deriveFont(Font.PLAIN));
        return label;
    }

    // Build the top panel holding the search inputs and crawl stats.
    private JPanel buildSearchPanel() {
        JPanel searchPanel = new JPanel();
        GridBagLayout layout = new GridBagLayout();
        searchPanel.setLayout(layout);
        GridBagConstraints constraints;

        // Start URL.
        place(searchPanel, layout, new JLabel("Start URL:"),
              captionConstraints(0));
        startTextField = new JTextField();
        place(searchPanel, layout, startTextField, valueConstraints(0));

        // Max URLs to crawl (editable combo box; empty = unlimited).
        place(searchPanel, layout, new JLabel("Max URLs to Crawl:"),
              captionConstraints(0));
        maxComboBox = new JComboBox<String>(MAX_URLS);
        maxComboBox.setEditable(true);
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 0);
        place(searchPanel, layout, maxComboBox, constraints);

        limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.WEST;
        constraints.insets = new Insets(0, 10, 0, 0);
        place(searchPanel, layout, limitCheckBox, constraints);

        // Filler so the row above ends here.
        JLabel blankLabel = new JLabel();
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        place(searchPanel, layout, blankLabel, constraints);

        // Matches log file (defaults to crawler.log in the working dir).
        place(searchPanel, layout, new JLabel("Matches Log File:"),
              captionConstraints(0));
        String file = System.getProperty("user.dir") +
            System.getProperty("file.separator") + "crawler.log";
        logTextField = new JTextField(file);
        place(searchPanel, layout, logTextField, valueConstraints(0));

        // Search string.
        place(searchPanel, layout, new JLabel("Search String:"),
              captionConstraints(0));
        searchTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.insets = new Insets(5, 5, 0, 0);
        constraints.gridwidth = 2;
        constraints.weightx = 1.0d;
        place(searchPanel, layout, searchTextField, constraints);

        caseCheckBox = new JCheckBox("Case Sensitive");
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 5);
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        place(searchPanel, layout, caseCheckBox, constraints);

        // Search/Stop button.
        searchButton = new JButton("Search");
        searchButton.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionSearch();
            }
        });
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        place(searchPanel, layout, searchButton, constraints);

        JSeparator separator = new JSeparator();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        place(searchPanel, layout, separator, constraints);

        // Crawl statistics rows.
        place(searchPanel, layout, new JLabel("Crawling:"),
              captionConstraints(0));
        crawlingLabel2 = plainLabel();
        place(searchPanel, layout, crawlingLabel2, valueConstraints(0));

        place(searchPanel, layout, new JLabel("Crawled URLs:"),
              captionConstraints(0));
        crawledLabel2 = plainLabel();
        place(searchPanel, layout, crawledLabel2, valueConstraints(0));

        place(searchPanel, layout, new JLabel("URLs to Crawl:"),
              captionConstraints(0));
        toCrawlLabel2 = plainLabel();
        place(searchPanel, layout, toCrawlLabel2, valueConstraints(0));

        place(searchPanel, layout, new JLabel("Crawling Progress:"),
              captionConstraints(0));
        progressBar = new JProgressBar();
        progressBar.setMinimum(0);
        progressBar.setStringPainted(true);
        place(searchPanel, layout, progressBar, valueConstraints(0));

        place(searchPanel, layout, new JLabel("Search Matches:"),
              captionConstraints(10));
        matchesLabel2 = plainLabel();
        place(searchPanel, layout, matchesLabel2, valueConstraints(10));

        return searchPanel;
    }

    // A fresh, read-only, single-column table model for match URLs.
    private static DefaultTableModel newMatchesModel() {
        return new DefaultTableModel(new Object[][]{},
                                     new String[]{"URL"}) {
            public boolean isCellEditable(int row, int column) {
                return false;
            }
        };
    }

    // Build the bordered panel holding the matches table.
    private JPanel buildMatchesPanel() {
        table = new JTable(newMatchesModel());
        JPanel matchesPanel = new JPanel();
        matchesPanel.setBorder(
            BorderFactory.createTitledBorder("Matches"));
        matchesPanel.setLayout(new BorderLayout());
        matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
        return matchesPanel;
    }

    // Exit this program.
    private void actionExit() {
        System.exit(0);
    }

    // Handle the Search/Stop button being clicked.
    private void actionSearch() {
        // If a crawl is underway the button reads "Stop": clear the
        // flag and let the crawler thread wind down on its own.
        if (crawling) {
            crawling = false;
            return;
        }

        ArrayList<String> errorList = new ArrayList<String>();

        // Validate and verify the start URL.
        String startUrl = startTextField.getText().trim();
        if (startUrl.length() < 1) {
            errorList.add("Missing Start URL.");
        } else if (verifyUrl(startUrl) == null) {
            errorList.add("Invalid Start URL.");
        }

        // Max URLs must be empty (-1 = unlimited) or a positive number.
        // (The original passed 0 for "empty", which made crawl() break
        // immediately because crawledList.size() == 0 == maxUrls.)
        int maxUrls = -1;
        String max = ((String) maxComboBox.getSelectedItem()).trim();
        if (max.length() > 0) {
            int parsed = 0;
            try {
                parsed = Integer.parseInt(max);
            } catch (NumberFormatException e) {
                // Fall through: parsed stays 0 and is reported below.
            }
            if (parsed < 1) {
                errorList.add("Invalid Max URLs value.");
            } else {
                maxUrls = parsed;
            }
        }

        // Matches log file and search string are both required.
        String logFile = logTextField.getText().trim();
        if (logFile.length() < 1) {
            errorList.add("Missing Matches Log File.");
        }
        String searchString = searchTextField.getText().trim();
        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
        }

        // Show all validation errors at once and bail out.
        if (errorList.size() > 0) {
            showError(String.join("\n", errorList));
            return;
        }

        // Remove "www" from the start URL if present and start crawling.
        startUrl = removeWwwFromUrl(startUrl);
        search(logFile, startUrl, maxUrls, searchString);
    }

    // Enable or disable the search input controls as a group.
    private void setSearchControlsEnabled(boolean enabled) {
        startTextField.setEnabled(enabled);
        maxComboBox.setEnabled(enabled);
        limitCheckBox.setEnabled(enabled);
        logTextField.setEnabled(enabled);
        searchTextField.setEnabled(enabled);
        caseCheckBox.setEnabled(enabled);
    }

    // Run the crawl on a background thread so the GUI stays responsive.
    private void search(final String logFile, final String startUrl,
                        final int maxUrls, final String searchString)
    {
        Thread thread = new Thread(new Runnable() {
            public void run() {
                // Show hourglass cursor and lock the inputs while
                // crawling is under way.
                setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
                setSearchControlsEnabled(false);
                searchButton.setText("Stop");

                // Reset stats and the matches table.
                table.setModel(newMatchesModel());
                updateStats(startUrl, 0, 0, maxUrls);

                try {
                    // Open matches log file.
                    try {
                        logFileWriter =
                            new PrintWriter(new FileWriter(logFile));
                    } catch (Exception e) {
                        // The finally block below restores the controls
                        // (the original left them disabled here).
                        showError("Unable to open matches log file.");
                        return;
                    }

                    crawling = true;
                    crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                          searchString, caseCheckBox.isSelected());
                    crawling = false;

                    // Close matches log file (flushes pending matches).
                    try {
                        logFileWriter.close();
                    } catch (Exception e) {
                        showError("Unable to close matches log file.");
                    }
                    crawlingLabel2.setText("Done");
                } finally {
                    // Always restore the GUI, even on early return.
                    setSearchControlsEnabled(true);
                    searchButton.setText("Search");
                    setCursor(Cursor.getDefaultCursor());
                }

                // Tell the user when nothing matched.
                if (table.getRowCount() == 0) {
                    JOptionPane.showMessageDialog(SearchCrawler.this,
                        "Your Search String was not found. Please try another.",
                        "Search String Not Found",
                        JOptionPane.WARNING_MESSAGE);
                }
            }
        });
        thread.start();
    }

    // Show dialog box with error message.
    private void showError(String message) {
        JOptionPane.showMessageDialog(this, message, "Error",
            JOptionPane.ERROR_MESSAGE);
    }

    // Update crawling stats shown in the GUI.
    private void updateStats(
        String crawling, int crawled, int toCrawl, int maxUrls)
    {
        crawlingLabel2.setText(crawling);
        crawledLabel2.setText("" + crawled);
        toCrawlLabel2.setText("" + toCrawl);
        // With no URL cap (-1) the bar tracks total discovered work.
        if (maxUrls == -1) {
            progressBar.setMaximum(crawled + toCrawl);
        } else {
            progressBar.setMaximum(maxUrls);
        }
        progressBar.setValue(crawled);
        matchesLabel2.setText("" + table.getRowCount());
    }

    // Add a matching URL to the matches table and the log file.
    private void addMatch(String url) {
        DefaultTableModel model = (DefaultTableModel) table.getModel();
        model.addRow(new Object[]{url});
        try {
            logFileWriter.println(url);
        } catch (Exception e) {
            showError("Unable to log match.");
        }
    }

    // Verify URL format; only HTTP URLs are accepted.
    // Returns the parsed URL, or null if the string is not a valid
    // http:// URL.
    public static URL verifyUrl(String url) {
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        try {
            return new URL(url);
        } catch (Exception e) {
            return null;
        }
    }

    // Check whether robots are allowed to access the given URL,
    // according to the host's robots.txt Disallow rules.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();

        // Retrieve the host's disallow list from cache; on a miss,
        // download robots.txt and cache the parsed list.
        ArrayList<String> disallowList = disallowListCache.get(host);
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl =
                    new URL("http://" + host + "/robots.txt");
                try (BufferedReader reader =
                         new BufferedReader(new InputStreamReader(
                             robotsFileUrl.openStream()))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (line.indexOf("Disallow:") == 0) {
                            String disallowPath =
                                line.substring("Disallow:".length());
                            // Strip a trailing comment if present.
                            int commentIndex = disallowPath.indexOf("#");
                            if (commentIndex != -1) {
                                disallowPath =
                                    disallowPath.substring(0, commentIndex);
                            }
                            disallowPath = disallowPath.trim();
                            // An empty Disallow allows everything per
                            // the robots exclusion standard; adding ""
                            // would block the whole site via
                            // startsWith("").
                            if (disallowPath.length() > 0) {
                                disallowList.add(disallowPath);
                            }
                        }
                    }
                }
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                /* Assume the robot is allowed: an exception is thrown
                   when the robots.txt file doesn't exist. */
                return true;
            }
        }

        // The URL is allowed unless its path starts with a
        // disallowed prefix.
        String file = urlToCheck.getFile();
        for (String disallow : disallowList) {
            if (file.startsWith(disallow)) {
                return false;
            }
        }
        return true;
    }

    // Download the page at the given URL; returns its contents as one
    // string, or null on any failure.
    // NOTE(review): lines are concatenated without a separator, as in
    // the original, so words at line breaks run together.
    private static String downloadPage(URL pageUrl) {
        try (BufferedReader reader =
                 new BufferedReader(new InputStreamReader(
                     pageUrl.openStream()))) {
            String line;
            StringBuilder pageBuffer = new StringBuilder();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            return null;
        }
    }

    // Remove a leading "www." from a URL's host if present,
    // e.g. "http://www.foo.com/x" -> "http://foo.com/x".
    public static String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            // Keep through "://" (index + 3), skip the "www." part.
            return url.substring(0, index + 3) +
                url.substring(index + 7);
        }
        return url;
    }

    // Parse the page contents and return the list of crawlable links:
    // absolute/relative links resolved against pageUrl, with anchors,
    // mailto/javascript links, already-crawled URLs and (optionally)
    // off-site URLs filtered out.
    public static ArrayList<String> retrieveLinks(
        URL pageUrl, String pageContents, HashSet<String> crawledList,
        boolean limitHost)
    {
        // Compile link matching pattern.
        Pattern p =
            Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                            Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            // Skip empty links and pure page anchors.
            if (link.length() < 1 || link.charAt(0) == '#') {
                continue;
            }
            // Skip mailto and JavaScript links.
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }

            // Resolve scheme-less links against the current page.
            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') {
                    // Host-absolute path.
                    link = "http://" + pageUrl.getHost() + link;
                } else {
                    // Relative path: resolve against the page's dir.
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                        link = "http://" + pageUrl.getHost() + "/" + link;
                    } else {
                        String path =
                            file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + path + link;
                    }
                }
            }

            // Remove any anchor fragment from the link.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }

            // Normalize and verify; skip invalid links.
            link = removeWwwFromUrl(link);
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }

            /* If specified, limit links to those having the same
               host as the start URL. */
            if (limitHost &&
                !pageUrl.getHost().toLowerCase().equals(
                    verifiedLink.getHost().toLowerCase()))
            {
                continue;
            }

            // Skip links that have already been crawled.
            if (crawledList.contains(link)) {
                continue;
            }

            linkList.add(link);
        }
        return linkList;
    }

    /* Determine whether every whitespace-separated term of the search
       string appears in the given page contents.  When caseSensitive
       is false both sides are lowercased before comparison. */
    public static boolean searchStringMatches(
        String pageContents, String searchString,
        boolean caseSensitive)
    {
        String searchContents = pageContents;
        if (!caseSensitive) {
            searchContents = pageContents.toLowerCase();
        }

        // Split search string into individual terms.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);

        // Every term must be present for a match.
        for (int i = 0; i < terms.length; i++) {
            String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
            if (searchContents.indexOf(term) == -1) {
                return false;
            }
        }
        return true;
    }

    // Perform the actual breadth-first crawl, searching each page for
    // the search string.  maxUrls == -1 means no limit.
    public void crawl(
        String startUrl, int maxUrls, boolean limitHost,
        String searchString, boolean caseSensitive)
    {
        // Crawled set and insertion-ordered frontier.
        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
        toCrawlList.add(startUrl);

        while (crawling && toCrawlList.size() > 0)
        {
            // Stop once the URL cap (if any) has been reached.
            if (maxUrls != -1 && crawledList.size() == maxUrls) {
                break;
            }

            // Take the oldest URL off the frontier.
            String url = toCrawlList.iterator().next();
            toCrawlList.remove(url);

            // Skip URLs that fail verification or that robots.txt
            // forbids.  (Frontier URLs are pre-verified, so the null
            // check is purely defensive.)
            URL verifiedUrl = verifyUrl(url);
            if (verifiedUrl == null || !isRobotAllowed(verifiedUrl)) {
                continue;
            }

            updateStats(url, crawledList.size(), toCrawlList.size(),
                        maxUrls);

            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            /* If the page was downloaded successfully, retrieve all
               its links and then see if it contains the search
               string. */
            if (pageContents != null && pageContents.length() > 0)
            {
                toCrawlList.addAll(
                    retrieveLinks(verifiedUrl, pageContents, crawledList,
                                  limitHost));
                if (searchStringMatches(pageContents, searchString,
                                        caseSensitive))
                {
                    addMatch(url);
                }
            }

            updateStats(url, crawledList.size(), toCrawlList.size(),
                        maxUrls);
        }
    }

    // Run the Search Crawler.
    public static void main(String[] args) {
        // Build and show the GUI on the Event Dispatch Thread
        // (Frame.show() is deprecated; use setVisible).
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                new SearchCrawler().setVisible(true);
            }
        });
    }
}

【收藏】SearchCrawler By James Holmes的更多相关文章

  1. php面试题及答案收藏(转)

    php面试题及答案收藏(这套试题是在网上看到的,不知作者是谁) 基础题 1.表单中 get与post提交方法的区别? 答:get是发送请求HTTP协议通过url参数传递进行接收,而post是实体数据, ...

  2. PHP常见的一些问题总结(收藏)

    本篇文章给大家带来的内容是关于PHP常见的一些问题总结(收藏),有一定的参考价值,有需要的朋友可以参考一下,希望对你有所帮助. 1. 字符串定义的时候单引号和双引号有什么区别? 单引号加载速度比双引号 ...

  3. JavaScript初学者福利!必须收藏的24条小技巧

    JavaScript初学者福利!必须收藏的24条小技巧 前端小编 发布于 2013-12-15 22:52 查看数: 2343 评论数: 6 帖子模式 这篇文章将回顾JavaScript的知识 !如果 ...

  4. ZT 感触的屌丝职场记 投递人 itwriter 发布于 2013-05-27 09:21 评论(18) 有3402人阅读 原文链接 [收藏] « »   作者@幻想哥呀幻想哥   有一位屌丝男,从小抱着报效祖国的理想上了大学,毕业后干了 IT 行业,高中那时候看文汇报说,搞 IT 的在上

    屌丝职场记 投递人 itwriter 发布于 2013-05-27 09:21 评论(18) 有3402人阅读  原文链接  [收藏]  « » 作者@幻想哥呀幻想哥 有一位屌丝男,从小抱着报效祖国的 ...

  5. [转帖]可能是东半球最好的 Curl 学习指南,强烈建议收藏!

    可能是东半球最好的 Curl 学习指南,强烈建议收藏! http://www.itpub.net/2019/09/30/3302/ 记得转帖过.. 简介 curl 是常用的命令行工具,用来请求 Web ...

  6. h5应用缓存及收藏时Icon显示

    h5应用实现离线缓存,加载后,断网仍然可以继续使用. 一.需求 转行做h5,目前做赛道游戏,动手做了个赛道编辑器web版的,由于web版需要开启服务器才能使用,策划要想回家使用就要发布到外网服务器了, ...

  7. 基于Metronic的Bootstrap开发框架经验总结(13)--页面链接收藏夹功能的实现2(利用Sortable进行拖动排序)

    在上篇随笔<基于Metronic的Bootstrap开发框架经验总结(12)--页面链接收藏夹功能的实现>上,我介绍了链接收藏夹功能的实现,以及对收藏记录的排序处理.该篇随笔主要使用功能按 ...

  8. 修复 Windows7 资源管理器左侧收藏夹无法展开问题

    相信大家在网上搜多到的解决办法大多数都是修改注册表,但是这个办法多数是无效的 1.运行regedit 2.展开到HKEY_CLASSES_ROOT\lnkfile 3.添加一个字符串值:IsShort ...

  9. JavaMail和James

      JavaMail,顾名思义,提供给开发者处理电子邮件相关的编程接口.它是Sun发布的用来处理email的API.它可以方便地执行一些常用的邮件传输.我们可以基于JavaMail开发出类似于Micr ...

随机推荐

  1. 解决easyui tabs中href无法跨域跳转

    <!DOCTYPE HTML> <html> <head> <meta http-equiv="content-type" content ...

  2. 易于同其它View框架(Tiles等)无缝集成,采用IOC便于测试

    Lifecycle for overriding binding, validation, etc,易于同其它View框架(Tiles等)无缝集成,采用IOC便于测试. 它是一个典型的教科书式的mvc ...

  3. python 函数 参数 (难点传入dict list)

    --使用参数组可以传值列表,字典:格式 #-*-coding:utf-8-*- def func5(x,*s,**gs): print(x) print(s) print(gs) print '*'* ...

  4. redis的下载

    网址一:https://github.com/dmajkic/redis/downloads 网址二:http://windows.php.net/downloads/pecl/releases/re ...

  5. thinkPHP为什么设置一个单入口文件?

    TP3.2的具体解释: ThinkPHP采用单一入口模式进行项目部署和访问,无论完成什么功能,一个应用都有一个统一(但不一定是唯一)的入口. 应该说,所有应用都是从入口文件开始的,并且不同应用的入口文 ...

  6. Log4j 使用

    源博客 http://www.cnblogs.com/alipayhutu/archive/2012/06/21/2558249.html#3159794 [1]从零开始 a). 新建Java Pro ...

  7. 5-1、easyUI-菜单与按钮(上节问题与解决)

    首先把上节的代码copy过来,如下: <html> <head> <meta http-equiv="Content-Type" content=&q ...

  8. PHP面向过程和面向对象

    php程序编写分为面向过程和面向对象.两者在功能实现上没有区别,但是在代码编写上区别很大,面向过程的代码很乱,不易管理,而面向对象把常用的功能封装为一个类,这样代码清楚多了. 下面举个小例子说明一下: ...

  9. [Sdoi2011]火星移民

    2283: [Sdoi2011]火星移民 Time Limit: 40 Sec  Memory Limit: 512 MBSubmit: 119  Solved: 56[Submit][Status] ...

  10. java学习笔记——数据类型及类型转换

    数据类型分为: 1.引用类型(字符型); 2.基本数据类型(数值型); 以下为基本数据类型介绍(括号内的数字表示该类型所占据的字节数) a.整型 byte(8)   short(16)   int(3 ...