Java 简单实现爬虫

思路

先通过 Java 发送 HTTP 请求,获取需要下载的图片的网址;再根据图片网址读取图片的字节数据,并写入本地文件夹中。

代码

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal image crawler for pic.netbian.com.
 *
 * <p>For each index page it collects the ids of the linked detail pages
 * ({@code /tupian/<id>.html}), fetches every detail page to find the image path
 * (an {@code <img src="..." data-pic} tag) and saves the image as
 * {@code <counter>.jpg} in the output directory.
 *
 * <p>All state is kept in static fields and nothing is synchronized, so the class
 * is meant to be driven from a single thread.
 */
public class Getphoto {
	/** Site root; the detail-page ids and image paths scraped from the HTML are appended to it. */
	private static final String BASE_URL = "http://pic.netbian.com";

	/** Browser-like User-Agent header sent with every request. */
	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3706.400 SLBrowser/10.0.4040.400";

	/** Connect and read timeout, in milliseconds. */
	private static final int TIMEOUT_MS = 5000;

	/** Index pages 1..LAST_PAGE are crawled. */
	private static final int LAST_PAGE = 10;

	/** Output directory used when none is given on the command line. */
	private static final String DEFAULT_OUTPUT_DIR = "C:\\Users\\Lenovo\\Desktop\\images";

	/** Captures the id of a detail page link; compiled once instead of once per line. */
	private static final Pattern DETAIL_LINK = Pattern.compile("<a href=\"/tupian/(.*?).html");

	/** Captures the site-relative path of the image shown on a detail page. */
	private static final Pattern IMAGE_SRC = Pattern.compile("<img src=\"(.*?)\" data-pic");

	/** Detail-page ids collected by {@link #httpget(String)}; grows with every page scanned. */
	private static final ArrayList<String> detailIds = new ArrayList<String>();

	/** Site-relative image path found by the most recent scan, or {@code null} if none was found. */
	private static String imagePath = null;

	/** Number used as the file name of the next image. */
	private static int imageCounter = 1;

	/**
	 * Fetches a page with an HTTP GET (decoded as GBK) and scans it line by line.
	 * Every detail-page id found is appended to the internal id list and the last
	 * image path found replaces the remembered image path.
	 *
	 * <p>Best effort: any failure is printed to standard error and otherwise ignored,
	 * leaving whatever was collected before the failure in place.
	 *
	 * @param geturl absolute HTTP URL of the page to scan
	 */
	public static void httpget(String geturl) {
		try {
			HttpURLConnection connection = openConnection(geturl);
			try (BufferedReader reader = new BufferedReader(
					new InputStreamReader(connection.getInputStream(), "GBK"))) {
				String line;
				while ((line = reader.readLine()) != null) {
					detailIds.addAll(extractDetailIds(line));
					String found = extractImagePath(line);
					if (found != null) {
						imagePath = found;
					}
				}
			} finally {
				// Release the socket even when reading fails half way.
				connection.disconnect();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the ids of all detail-page links ({@code <a href="/tupian/ID.html}) in one line of HTML.
	 *
	 * @param line a single line of HTML
	 * @return the captured ids in order of appearance; empty if the line has no such link
	 */
	static ArrayList<String> extractDetailIds(String line) {
		ArrayList<String> ids = new ArrayList<String>();
		Matcher matcher = DETAIL_LINK.matcher(line);
		while (matcher.find()) {
			ids.add(matcher.group(1));
		}
		return ids;
	}

	/**
	 * Returns the image path of the last {@code <img src="..." data-pic} tag in one line of HTML.
	 *
	 * @param line a single line of HTML
	 * @return the captured {@code src} value, or {@code null} if the line has no such tag
	 */
	static String extractImagePath(String line) {
		String last = null;
		Matcher matcher = IMAGE_SRC.matcher(line);
		while (matcher.find()) {
			last = matcher.group(1);
		}
		return last;
	}

	/**
	 * Opens (without connecting yet) an HTTP connection configured with the shared
	 * timeouts, cache setting and User-Agent header.
	 *
	 * @param address absolute HTTP URL
	 * @return the configured connection; the caller must disconnect it
	 * @throws java.io.IOException if the URL is malformed or the connection cannot be opened
	 */
	private static HttpURLConnection openConnection(String address) throws java.io.IOException {
		HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
		connection.setConnectTimeout(TIMEOUT_MS);
		connection.setReadTimeout(TIMEOUT_MS);
		connection.setUseCaches(false);
		connection.setRequestProperty("user-agent", USER_AGENT);
		return connection;
	}

	/**
	 * Downloads one image and stores it at the given path.
	 *
	 * <p>The image is read completely into memory first and the file is only created
	 * after the download succeeded, so a failed download does not leave an empty or
	 * truncated file behind. Failures are printed to standard error and otherwise ignored.
	 *
	 * @param urlList absolute HTTP URL of the image
	 * @param path    file to write the image to
	 */
	private static void downloadPicture(String urlList, String path) {
		try {
			HttpURLConnection connection = openConnection(urlList);
			ByteArrayOutputStream image = new ByteArrayOutputStream();
			try (DataInputStream in = new DataInputStream(connection.getInputStream())) {
				byte[] buffer = new byte[8192];
				int length;
				while ((length = in.read(buffer)) != -1) {
					image.write(buffer, 0, length);
				}
			} finally {
				connection.disconnect();
			}
			try (FileOutputStream out = new FileOutputStream(new File(path))) {
				image.writeTo(out);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Crawls the index pages 1..10 and downloads every image they link to.
	 *
	 * @param args optional; {@code args[0]} overrides the output directory
	 *             (default {@code C:\Users\Lenovo\Desktop\images})
	 */
	public static void main(String[] args) {
		File outputDir = new File(args != null && args.length > 0 ? args[0] : DEFAULT_OUTPUT_DIR);
		// Create the directory once up front instead of re-checking it for every image.
		if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
			System.err.println("Cannot create output directory: " + outputDir);
			return;
		}

		for (int page = 1; page <= LAST_PAGE; page++) {
			String indexUrl = page == 1
					? BASE_URL + "/index.html"
					: BASE_URL + "/index_" + page + ".html";

			detailIds.clear();
			httpget(indexUrl);
			// Work on a snapshot: scanning a detail page appends the links found there
			// to detailIds as well, which must not extend (or disturb) this loop.
			ArrayList<String> pageIds = new ArrayList<String>(detailIds);

			for (String id : pageIds) {
				// Forget the previous result so a failed fetch cannot re-download the last image.
				imagePath = null;
				httpget(BASE_URL + "/tupian/" + id + ".html");
				if (imagePath == null) {
					System.err.println("No image found on detail page " + id);
				} else {
					downloadPicture(BASE_URL + imagePath,
							new File(outputDir, imageCounter + ".jpg").getPath());
				}
				// The counter advances for every detail page, whether or not a file was written.
				imageCounter++;
			}
		}
	}
}

评论

  1. 4年前
    2020-9-14 12:33:30

    牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛牛

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇