"简单的Python爬虫程序"

A simple web crawler in Python

Posted by xuepro on May 5, 2018
#Importing the modules

#import urllib.request
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.request import URLError
from urllib import request  
import urllib

import json
import re  

def extract(text, sub1, sub2):
    """
    Return the part of text that lies between the first occurrence of
    sub1 and the first occurrence of sub2 that follows it.

    If sub1 does not occur, the search for sub2 starts at the beginning
    of text; if sub2 does not occur, everything after sub1 is returned.
    """
    _, found, after = text.partition(sub1)
    # When sub1 is missing, partition leaves the whole text in the first
    # slot, so fall back to scanning the full text.
    remainder = after if found else text
    before, _, _ = remainder.partition(sub2)
    return before

# Ask for a search keyword and build the Baidu image-search ("flip" view) URL.
# The keyword must be percent-encoded because it may contain non-ASCII text.
word = input("Input key word: ")
word = urllib.parse.quote(word)
url = ''.join(["https://images.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=",word,"&ct=201326592&ic=0&lm=-1&width=&height=&v=flip"])

# The header value must not repeat the field name: Request() adds the
# "User-Agent:" part itself.
headers = {'User-Agent': 'Mozilla/5.0'}
req = request.Request(url, headers=headers)

# Read the whole page and release the connection even if reading fails.
with urlopen(req) as data:
    charset = data.info().get_content_charset()
    html = data.read()

# Choose the text encoding: the charset from the Content-Type header wins;
# otherwise look for a 'charset=...' declaration inside the page itself;
# otherwise assume UTF-8.
encoding = charset
if not encoding:
    page_ascii = html.decode('ascii', errors='ignore').lower()
    if 'charset=' in page_ascii:
        encoding = extract(page_ascii, 'charset=', '"').strip()
if not encoding:
    encoding = 'utf-8'

print('-'*50)
print( "Encoding type = %s" % encoding )
print('-'*50)

try:
    # errors='replace' keeps a few bad bytes from aborting the whole run.
    html = html.decode(encoding, errors='replace')
except LookupError:
    # The sniffed name was not a codec Python knows about.
    encoding = 'utf-8'
    html = html.decode(encoding, errors='replace')


#data.close()  
  
#reg = r'src="(.+?\.jpg)"'  
#imgre = re.compile(reg)  
#imglist = re.findall(imgre, html) 

# Every result on the page carries its original image address in an
# "objURL" JSON field; collect them all in page order.
imglist = re.findall('"objURL":"(.*?)",',html,re.S)

# Download each image as 0.jpg, 1.jpg, ... in the current directory.
# A failed download is reported and skipped so the remaining images are
# still fetched.
for x, imgurl in enumerate(imglist):
    print(imgurl)
    try:
        urlretrieve(imgurl, "%s.jpg" % x)
    except (OSError, ValueError) as e:
        # URLError and HTTPError are OSError subclasses; ValueError is raised
        # for scraped strings that are not usable URLs. Only HTTPError has a
        # .code attribute, so read it defensively.
        if getattr(e, 'code', None) == 404:
            print("4 0 4")
        else:
            print("%s" % e)

参考:python爬虫实战——5分钟做个图片自动下载器


支付宝打赏 微信打赏

您的打赏是对我最大的鼓励!