在git clone完项目后,发现一个很诡异的现象:JewelCrawler每次都只爬取种子地址,并没有依次查询数据库中crawled字段为0的记录逐一进行爬取。但是之前在本机上是完美运行的,可能是在push代码前做的改动影响了运行。
- //set boolean value "crawled" to true after crawling this page
- sql = "UPDATE record SET crawled = 1 WHERE URL = '" + url + "'";
- stmt = conn.createStatement();
- if (stmt.executeUpdate(sql) > 0) {
- //get the next page that has not been crawled yet
- sql = "SELECT * FROM record WHERE crawled = 0";
- stmt = conn.createStatement();
- rs = stmt.executeQuery(sql);
- if (rs.next()) {
- url = rs.getString(2);
- } else {
- //stop crawling if reach the bottom of the list
- break;
- }
- //set a limit of crawling count
- if (count > Constants.maxCycle || url == null) {
- break;
- }
- }
执行stmt.executeUpdate(sql)时返回的值为0(种子网站并没有存入record表,UPDATE匹配不到任何记录),if条件不成立,从而不会从数据库中读取crawled为0的记录,最后就一直在while循环中爬取种子网站。
解决方法:对于种子网站既然没有存储到record的操作,那么就对种子网站做特殊处理,将if的判断条件改为if (stmt.executeUpdate(sql) > 0 || frontPage.equals(url)),这样对于种子网站即使没有update更新成功操作仍然可以进入读取数据库crawled为0 的操作。
- public static void parseFromString(String content, Connection conn) throws Exception {
- Parser parser = new Parser(content);
- HasAttributeFilter filter = new HasAttributeFilter("href");
- String sql1 = null;
- ResultSet rs1 = null;
- PreparedStatement pstmt1 = null;
- Statement stmt1 = null;
- List<String> nextLinkList = new ArrayList<String>();
- int rowCount = 0;
- sql1 = "select count(*) as rowCount from record";
- stmt1 = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE);
- rs1 = stmt1.executeQuery(sql1);
- if ( {
- rowCount = rs1.getString("rowCount") != null ? Integer.parseInt(rs1.getString("rowCount")) : 0;
- }
- if (rowCount <= Constants.maxCycle) { //once rowCount is bigger than maxCycle, the new crawled link will not insert into record table
- try {
- NodeList list = parser.parse(filter);
- int count = list.size();
- //process every link on this page
- for (int i = 0; i < count; i++) {
- Node node = list.elementAt(i);
- if (node instanceof LinkTag) {
- LinkTag link = (LinkTag) node;
- String nextLink = link.extractLink();
- String mainUrl = Constants.MAINURL;
- if (nextLink.startsWith(mainUrl)) {
- //check if the link already exists in the database
- sql1 = "SELECT * FROM record WHERE URL = '" + nextLink + "'";
- stmt1 = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE);
- rs1 = stmt1.executeQuery(sql1);
- if ( {
- } else {
- Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP);
- Matcher movieMatcher = moviePattern.matcher(nextLink);
- Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP);
- Matcher commentMatcher = commentPattern.matcher(nextLink);
- if (movieMatcher.find() || commentMatcher.find()) {
- nextLinkList.add(nextLink);
- }
- }
- }
- }
- }
- if (nextLinkList.size() > 0) {
- conn.setAutoCommit(false);
- //if the link does not exist in the database, insert it
- sql1 = "INSERT INTO record (URL, crawled) VALUES (?,0)";
- pstmt1 = conn.prepareStatement(sql1, Statement.RETURN_GENERATED_KEYS);
- for (String nextLinkStr : nextLinkList) {
- pstmt1.setString(1, nextLinkStr);
- pstmt1.addBatch();
- System.out.println(nextLinkStr);
- }
- pstmt1.executeBatch();
- conn.commit();
- }
- } catch (Exception e) {
- //handle the exceptions
- e.printStackTrace();
- System.out.println("SQLException: " + e.getMessage());
- } finally {
- //close and release the resources of PreparedStatement, ResultSet and Statement
- if (pstmt1 != null) {
- try {
- pstmt1.close();
- } catch (SQLException e2) {
- }
- }
- pstmt1 = null;
- if (rs1 != null) {
- try {
- rs1.close();
- } catch (SQLException e1) {
- }
- }
- rs1 = null;
- if (stmt1 != null) {
- try {
- stmt1.close();
- } catch (SQLException e3) {
- }
- }
- stmt1 = null;
- }
- }
- }
3. 在批量操作中,使用了addBatch()方法和executeBatch()方法,注意需要添加conn.setAutoCommit(false);以及conn.commit()表示手动提交。
- public static void main(String args[]) throws Exception {
- //load and read seed file
- List<String> seedList = LoadSeed.loadSeed();
- if (seedList == null) {
-"No seed to crawl, please check again");
- return;
- }
- String frontPage = seedList.get(0);
- //connect database mysql
- Connection conn = DBUtils.connectDB();
- //create tables to store crawled data
- DBUtils.createTables();
- String sql = null;
- String url = frontPage;
- Statement stmt = null;
- ResultSet rs = null;
- int count = 0;
- List<String> urlList = new ArrayList<String>();
- urlList.add(url);
- //crawl every link in the database
- while (true) {
- //get page content of link "url"
- DouBanHttpGetUtil.getByString(urlList, conn);
- count++;
- //set boolean value "crawled" to true after crawling this page
- //TODO batch update
- int result = 0;
- conn.setAutoCommit(true);
- for (String urlStr : urlList) {
- sql = "UPDATE record SET crawled = 1 WHERE URL = '" + urlStr + "'";
- stmt = conn.createStatement();
- stmt.executeUpdate(sql);
- }
- urlList.clear();//empty for every loop
- if (stmt.executeUpdate(sql) > 0 || frontPage.equals(url)) {
- //get the next page that has not been crawled yet
- sql = "SELECT * FROM record WHERE crawled = 0 limit 10";
- stmt = conn.createStatement();
- rs = stmt.executeQuery(sql);
- while ( {
- url = rs.getString(2);
- urlList.add(url);
- }
- //set a limit of crawling count
- if ( || count > Constants.maxCycle || url == null) {
- break;
- }
- }
- }
- conn.close();
- conn = null;
- System.out.println("Done.");
- System.out.println(count);
- }
注意: 1.这里采用每次读取10条记录,相应的也需要将这10条记录的crawled字段更新为1,表示爬取过。
2. mysql不支持top 10 * 这样的语法,但是可以通过代码中所示的limit 10 的方式取出数据。
3. 添加conn.setAutoCommit(true);表示更新操作设置为自动提交,这样就可以解决虽然程序执行成功但是数据没有更新到数据库的现象。
