- //趁着有空回头复习了一把正则表达式
以下代码以百度某个贴吧的 URL 作为源,实现了读取 EmailAddress 并写入文件保存起来的两个功能,如果要爬取其它信息,可以改写正则实现相应功能
1.应用到 IO 读写缓冲字符流
2.正则表达式(Pattern、Matcher)匹配邮箱地址
3.URL 对象获取网页信息
4.util 包的集合框架 ArrayList
- import java.io.*;
- import java.util.regex.*;
- import java.net.*;
- import java.util.*;
/**
 * Crawls a web page for e-mail addresses, prints them and saves them to a
 * local text file, one address per line.
 */
class Spider {
    /** One or more word characters, an '@', then a domain with at least one dot-separated part. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        ArrayList<String> emailList = getEmailByURL(url);
        for (String emailAddress : emailList) {
            System.out.println(emailAddress);
        }
        String qualifiedName = "c://users//ghc//desktop//test//emailAddress.txt";
        writeEmailToFile(qualifiedName, emailList);
    }

    /**
     * Reads the content behind {@code url} line by line and collects every
     * substring that looks like an e-mail address.
     *
     * @param url location of the page to scan
     * @return the matches in the order they appear (duplicates are kept)
     * @throws Exception if the connection cannot be opened or read
     */
    public static ArrayList<String> getEmailByURL(URL url) throws Exception {
        ArrayList<String> emailList = new ArrayList<String>();
        URLConnection urlconn = url.openConnection();
        BufferedReader bufreader = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                // A single line may contain several addresses.
                while (m.find()) {
                    emailList.add(m.group());
                }
            }
        } finally {
            // Release the underlying connection stream even when reading fails.
            bufreader.close();
        }
        return emailList;
    }

    /**
     * Writes every entry of {@code emailList} to the file {@code qualifiedName},
     * one per line, replacing any previous content of that file.
     *
     * @param qualifiedName path of the file to write
     * @param emailList     addresses to store
     * @throws Exception if the file cannot be created or written
     */
    public static void writeEmailToFile(String qualifiedName, ArrayList<String> emailList) throws Exception {
        BufferedWriter bufwriter = new BufferedWriter(new FileWriter(qualifiedName));
        try {
            for (String emailAddress : emailList) {
                bufwriter.write(emailAddress);
                bufwriter.newLine();
            }
        } finally {
            // close() flushes the buffer and releases the file handle.
            bufwriter.close();
        }
    }
}
- /*读取键盘输入的 两种形式(Scanner 与 BufferedReader) */
- import java.io.*;
- import java.util.*;
- import java.util.Scanner;
/**
 * Demonstrates two ways of reading keyboard input: {@link Scanner} for a
 * single line, then {@link BufferedReader} for everything up to end of input.
 */
class MyTest {
    public static void main(String[] args) throws Exception {
        // Approach 1: Scanner reads exactly one line and echoes it.
        Scanner keyboard = new Scanner(System.in);
        System.out.println(keyboard.nextLine());

        // Approach 2: BufferedReader echoes each remaining line until EOF.
        // NOTE(review): both readers wrap the same System.in; Scanner may have
        // read ahead, so piped input can be consumed before this loop - confirm.
        BufferedReader console = new BufferedReader(new InputStreamReader(System.in));
        for (String text = console.readLine(); text != null; text = console.readLine()) {
            System.out.println(text);
        }
    }
}
- /* 把叠词 简化 */
/**
 * Collapses a "stuttered" sentence: first strips the dot runs, then reduces
 * every run of a repeated character to a single occurrence.
 */
class AbrreviateDemo {
    public static void main(String[] args) {
        String stuttered = "II...LLL...ove..ee.....you!";
        // Step 1: drop every run of dots.
        String withoutDots = retriveStr(stuttered, "\\.+", "");
        // Step 2: a character followed by repeats of itself becomes that character once.
        String collapsed = retriveStr(withoutDots, "(.)\\1+", "$1");
        System.out.println(collapsed);
    }

    /**
     * Replaces every match of {@code regex} in {@code str} with {@code replaceStr}.
     *
     * @param str        text to process
     * @param regex      regular expression to search for
     * @param replaceStr replacement; may reference groups such as {@code $1}
     * @return the text after all replacements
     */
    public static String retriveStr(String str, String regex, String replaceStr) {
        java.util.regex.Matcher matcher = java.util.regex.Pattern.compile(regex).matcher(str);
        return matcher.replaceAll(replaceStr);
    }
}
- /* 将一堆杂乱的 IP 地址进行排序 */
- import java.util.*;
/**
 * Sorts a space-separated list of dotted addresses by padding every numeric
 * segment to three digits so that plain string ordering matches numeric order.
 */
class SortIP {
    public static void main(String[] args) {
        String IP = "";
        printAfterSort(IP);
    }

    /**
     * Prints the zero-padded form of {@code str}, then each address on its own
     * line in ascending order with the padding removed again.
     *
     * @param str addresses separated by one or more spaces
     */
    public static void printAfterSort(String str) {
        // Prefix every digit run with two zeros so each has at least three digits ...
        String padded = str.replaceAll("(0*\\d+)", "00$1");
        // ... then keep only the last three digits of each run.
        String fixedWidth = padded.replaceAll("0*(\\d{3})", "$1");
        System.out.println(fixedWidth);

        String[] addresses = fixedWidth.split(" +");
        Arrays.sort(addresses);
        for (String address : addresses) {
            // Strip the leading zeros that were added for sorting.
            System.out.println(address.replaceAll("0*(\\d+)", "$1"));
        }
    }
}
- /* 邮箱地址校验 */
/**
 * Validates an e-mail address with a regular expression.
 */
class checkMailDemo {
    public static void main(String[] args) {
        String str = "liyu@gchchina.com.cn";
        System.out.println("result: " + checkMail(str));
    }

    /**
     * Checks whether the whole of {@code str} has the shape
     * {@code name@host.part}, where the host is followed by one to three
     * dot-separated parts and every part consists of word characters
     * (letters, digits, underscore).
     *
     * @param str candidate address; {@code null} is treated as invalid
     * @return {@code true} if the entire string matches the pattern
     */
    public static boolean checkMail(String str) {
        if (str == null) {
            return false;
        }
        // The earlier letter-only variant was assigned and immediately
        // overwritten, so only this pattern was ever in effect.
        return str.matches("\\w+@\\w+(\\.\\w+){1,3}");
    }
}
- /* 从一堆杂乱的字符串中获取需要的手机号码 */
- import java.util.regex.*;
/**
 * Extracts mobile phone numbers from a noisy string.
 */
class RegexDemo {
    /**
     * An 11-digit number starting with 1 whose second digit is 3, 5 or 8.
     * The character class deliberately contains no commas: inside brackets a
     * comma is a literal character, not a separator.
     */
    static final String MOBILE_REGEX = "1[358]\\d{9}";

    public static void main(String[] args) {
        String str = "1afasdf13874057617weojfjlj";
        retriveStr(str, MOBILE_REGEX);
    }

    /**
     * Prints every non-overlapping match of {@code regex} found in
     * {@code str}, one per line.
     *
     * @param str   text to search
     * @param regex regular expression to look for
     */
    public static void retriveStr(String str, String regex) {
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(str);
        while (m.find()) {
            System.out.println(m.group());
        }
    }
}
- /* 读取键盘标准输入流并大写方式打印到控制台 */
- import java.io.*;
- import java.util.*;
/**
 * Echoes standard input to the console in upper case.
 */
class UpercaseSystemIn {
    public static void main(String[] args) throws IOException {
        InputStream in = System.in;
        doUpcaseReadIn(in);
    }

    /**
     * Reads {@code in} line by line and prints each line in upper case.
     * Stops at end of input or after echoing a line equal to "exit"
     * (case-insensitive).
     *
     * <p>The method returns instead of terminating the JVM, so it can be
     * reused by callers that still have work to do. The stream is left open
     * because it belongs to the caller.
     *
     * @param in stream to read text from
     * @throws IOException if reading fails
     */
    public static void doUpcaseReadIn(InputStream in) throws IOException {
        BufferedReader bufr = new BufferedReader(new InputStreamReader(in));
        String str;
        while ((str = bufr.readLine()) != null) {
            System.out.println(str.toUpperCase());
            if (str.equalsIgnoreCase("exit")) {
                return;
            }
        }
    }
}
- /* 读取某个贴吧邮箱地址并打印到控制台 注意这里的正则*/
- import java.net.*;
- import java.io.*;
- import java.util.*;
- import java.util.regex.*;
/**
 * Prints every e-mail address found on a web page to the console.
 */
class SpiderTest {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        getEmailAddressFromURL(url);
    }

    /**
     * Reads the content behind {@code url} line by line and prints each
     * substring that looks like an e-mail address, one per line.
     *
     * @param url location of the page to scan
     * @throws Exception if the connection cannot be opened or read
     */
    public static void getEmailAddressFromURL(URL url) throws Exception {
        // Word characters, '@', then a domain with at least one dot-separated part.
        Pattern p = Pattern.compile("\\w+@\\w+(\\.\\w+)+");
        URLConnection urlconn = url.openConnection();
        BufferedReader bufreader = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = p.matcher(line);
                while (m.find()) {
                    System.out.println(m.group());
                }
            }
        } finally {
            // Release the connection stream even when reading fails.
            bufreader.close();
        }
    }
}
- /* 实现本地二进制文件拷贝 */
- import java.io.*;
/**
 * Copies the binary file {@code psb.jpg} in the working directory to
 * {@code psb_copy.jpg} using buffered byte streams.
 */
class CopyImg {
    public static void main(String[] args) {
        BufferedInputStream source = null;
        BufferedOutputStream target = null;
        try {
            source = new BufferedInputStream(new FileInputStream("psb.jpg"));
            target = new BufferedOutputStream(new FileOutputStream("psb_copy.jpg"));
            byte[] chunk = new byte[8192];
            // Write exactly as many bytes as each read delivered.
            for (int count = source.read(chunk); count > 0; count = source.read(chunk)) {
                target.write(chunk, 0, count);
            }
        } catch (IOException copyFailure) {
            copyFailure.printStackTrace();
        } finally {
            release(source);
            release(target);
        }
    }

    /** Closes {@code stream} if it was opened, printing any failure instead of propagating it. */
    private static void release(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
    }
}
- /* 从某个网页爬取符合规则的邮箱地址并保存到本地磁盘路径下 */
- import java.io.*;
- import java.util.regex.*;
- import java.net.*;
- import java.util.*;
/**
 * Crawls a web page for e-mail addresses that match the pattern, prints them
 * and saves them to a file on the local disk, one address per line.
 */
class Spider {
    /** One or more word characters, an '@', then a domain with at least one dot-separated part. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    public static void main(String[] args) throws Exception {
        URL url = new URL("http://tieba.baidu.com/p/2314539885");
        ArrayList<String> emailList = getEmailByURL(url);
        for (String emailAddress : emailList) {
            System.out.println(emailAddress);
        }
        String qualifiedName = "c://users//ghc//desktop//test//emailAddress.txt";
        writeEmailToFile(qualifiedName, emailList);
    }

    /**
     * Reads the content behind {@code url} line by line and collects every
     * substring that looks like an e-mail address.
     *
     * @param url location of the page to scan
     * @return the matches in the order they appear (duplicates are kept)
     * @throws Exception if the connection cannot be opened or read
     */
    public static ArrayList<String> getEmailByURL(URL url) throws Exception {
        ArrayList<String> emailList = new ArrayList<String>();
        URLConnection urlconn = url.openConnection();
        BufferedReader bufreader = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
        try {
            String line;
            while ((line = bufreader.readLine()) != null) {
                Matcher m = EMAIL_PATTERN.matcher(line);
                // A single line may contain several addresses.
                while (m.find()) {
                    emailList.add(m.group());
                }
            }
        } finally {
            // Release the underlying connection stream even when reading fails.
            bufreader.close();
        }
        return emailList;
    }

    /**
     * Writes every entry of {@code emailList} to the file {@code qualifiedName},
     * one per line, replacing any previous content of that file.
     *
     * @param qualifiedName path of the file to write
     * @param emailList     addresses to store
     * @throws Exception if the file cannot be created or written
     */
    public static void writeEmailToFile(String qualifiedName, ArrayList<String> emailList) throws Exception {
        BufferedWriter bufwriter = new BufferedWriter(new FileWriter(qualifiedName));
        try {
            for (String emailAddress : emailList) {
                bufwriter.write(emailAddress);
                bufwriter.newLine();
            }
        } finally {
            // close() flushes the buffer and releases the file handle.
            bufwriter.close();
        }
    }
}
- /* 从某个网页爬取图片的 URL 地址然后 进行 下载到本地磁盘路径 基本功能已经实现,但是正则需要自行调整 */
- import java.net.*;
- import java.io.*;
- import java.util.regex.*;
- import java.util.*;
/**
 * Scans a web page for {@code <img>} tags and downloads the referenced images
 * into a local folder. The two patterns below are intentionally simple and
 * will usually need tuning for the page being crawled.
 */
class ImgSpider {
    /** Matches an {@code <img ... src=...>} tag within one line of HTML. */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img.*src=(.*?)[^>]*?>");

    /** Matches image URLs of the form {@code http://host/images/name.ext}. */
    private static final Pattern IMG_URL_PATTERN =
            Pattern.compile("http://(\\w+\\.)+[a-z]+/images/(\\w+\\.)+[a-z]{3}");

    public static void main(String[] args) {
        saveImgFromURL("http://image.baidu.com/", "c:/users/ghc/desktop/test/");
    }

    /**
     * Downloads the resource at URL {@code line} into the file {@code path}.
     *
     * @param line URL of the image to fetch
     * @param path destination file; any '<' characters are removed from it
     * @return {@code true} if the whole resource was written, {@code false}
     *         if the URL is malformed or an I/O error occurred (the stack
     *         trace is printed)
     */
    public static boolean downLoadImg(String line, String path) {
        boolean flag = true;
        BufferedInputStream bufinpts = null;
        BufferedOutputStream bufopts = null;
        path = path.replace("<", "");
        try {
            bufinpts = new BufferedInputStream(new URL(line).openConnection().getInputStream());
            bufopts = new BufferedOutputStream(new FileOutputStream(path));
            byte[] buf = new byte[1024];
            int len;
            while ((len = bufinpts.read(buf)) != -1) {
                // Only the bytes delivered by this read are valid; the rest of
                // the buffer still holds data from earlier iterations.
                bufopts.write(buf, 0, len);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            flag = false;
        } finally {
            // A failed close of the output may mean buffered bytes were lost.
            if (!closeQuietly(bufopts)) {
                flag = false;
            }
            closeQuietly(bufinpts);
        }
        return flag;
    }

    /**
     * Reads the page at {@code urlStr}, prints every {@code <img>} tag found
     * and downloads each image URL inside those tags into {@code folder}.
     * Each file is named after the last path segment of its URL, so images
     * sharing a name overwrite one another.
     *
     * @param urlStr address of the page to scan
     * @param folder destination directory, including the trailing separator
     * @return {@code true} if the page was read and every download succeeded;
     *         {@code false} otherwise
     */
    public static boolean saveImgFromURL(String urlStr, String folder) {
        boolean flag = true;
        BufferedReader bufr = null;
        try {
            URL url = new URL(urlStr);
            URLConnection urlconn = url.openConnection();
            bufr = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));

            List<String> imgList = new ArrayList<String>();
            String line;
            while ((line = bufr.readLine()) != null) {
                Matcher tagMatcher = IMG_TAG_PATTERN.matcher(line);
                while (tagMatcher.find()) {
                    System.out.println(tagMatcher.group());
                    imgList.add(tagMatcher.group());
                }
            }

            for (String imgTag : imgList) {
                Matcher urlMatcher = IMG_URL_PATTERN.matcher(imgTag);
                while (urlMatcher.find()) {
                    String imgUrl = urlMatcher.group();
                    // Build the target path per image; the base folder itself
                    // must not grow from one download to the next.
                    String fileName = imgUrl.substring(imgUrl.lastIndexOf('/') + 1);
                    if (!downLoadImg(imgUrl, folder + fileName)) {
                        flag = false;
                    }
                }
            }
        } catch (IOException ioe) {
            // Also covers MalformedURLException, which is an IOException.
            ioe.printStackTrace();
            flag = false;
        } finally {
            closeQuietly(bufr);
        }
        return flag;
    }

    /**
     * Closes {@code resource} if it was opened.
     *
     * @return {@code false} if closing raised an error (the stack trace is
     *         printed), {@code true} otherwise
     */
    private static boolean closeQuietly(Closeable resource) {
        if (resource == null) {
            return true;
        }
        try {
            resource.close();
            return true;
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return false;
        }
    }
}
- /*正则 小练习 */
/**
 * Small regular-expression exercises: validating, splitting and replacing.
 */
class Demo {
    public static void main(String[] args) {
        String qq = "1212345";
        boolean checkResult = checkQQ(qq);
        System.out.println(checkResult ? qq + " is right" : qq + " is wrong!!!");

        String telnumber = "15974097817";
        checkResult = checkTel(telnumber);
        System.out.println(checkResult ? telnumber + " is right" : telnumber + " is wrong!!!");

        String path = "c:\\users\\frank\\abqqcdkkkefghhijkkkkkl.txt";
        // Split on repeated characters: the group captures one character and
        // the back-reference followed by '+' requires it to repeat at least
        // once more, so both "qq" and "kkk" act as delimiters.
        String regex = "(.)\\1+";
        printAfterSplit(path, regex);
        System.out.println("=================");
        regex = "\\.";
        printAfterSplit(path, regex);

        String str = "abcddeffffg";
        // Only runs of four or more identical characters are collapsed.
        regex = "(.)\\1{3,}";
        String replaceStr = "$1";
        printAfterReplaceStr(str, regex, replaceStr);
    }

    /**
     * Validates a mobile number: 11 digits, starting with 1, second digit 3,
     * 5 or 8. The character class contains no commas because a comma inside
     * brackets is a literal character, not a separator.
     *
     * @param telnumber candidate number
     * @return {@code true} if the whole string matches
     */
    public static boolean checkTel(String telnumber) {
        return telnumber.matches("1[358]\\d{9}");
    }

    /**
     * Validates a QQ number: 5 to 15 digits with no leading zero.
     *
     * @param qq candidate number
     * @return {@code true} if the whole string matches
     */
    public static boolean checkQQ(String qq) {
        return qq.matches("[1-9]\\d{4,14}");
    }

    /**
     * Splits {@code path} around matches of {@code regex} and prints each
     * piece on its own line.
     *
     * @param path  text to split
     * @param regex delimiter pattern
     */
    public static void printAfterSplit(String path, String regex) {
        String[] ary = path.split(regex);
        for (String s : ary) {
            System.out.println(s);
        }
    }

    /**
     * Replaces every match of {@code regex} in {@code str} with
     * {@code replaceStr} and prints the result.
     *
     * @param str        text to process
     * @param regex      pattern to search for
     * @param replaceStr replacement; may reference groups such as {@code $1}
     */
    public static void printAfterReplaceStr(String str, String regex, String replaceStr) {
        String resultStr = str.replaceAll(regex, replaceStr);
        System.out.println(resultStr);
    }
}
