python 爬虫练习代码
爬取豆瓣250代码
# Scrape the first page of Douban Top 250 movies:
# grab the page source, extract fields with a regex, write them to CSV.
import re
import requests
import csv

url = "https://movie.douban.com/top250"
header = {
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
rst = requests.get(url, headers=header)
page_content = rst.text

# Parse the data: one match per movie <li> entry.
# NOTE(review): group name 'cuntry' is a 'country' typo kept from the original;
# it is only consumed via dict.values(), so renaming would be cosmetic.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*? <p class="">(?P<daoyan>.*?) (?P<zhuyan>.*?)<br>(?P<year>.*?) / (?P<cuntry>.*?) / (?P<type>.*?)'
                 r'</p>.*?<span class="rating_num" property="v:average">(?P<count>.*?)</span>.*?<span class="inq">(?P<say>.*?)</span>', re.S)
result = obj.finditer(page_content)

# with-block guarantees the file is closed; newline="" prevents blank rows
# on Windows, utf-8 keeps the Chinese titles intact.
with open("movie.csv", "w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for ret in result:
        dic = ret.groupdict()
        dic['daoyan'] = dic['daoyan'].strip()
        dic['year'] = dic['year'].strip()
        csvwriter.writerow(dic.values())
print('over')
电影天堂案例
# Movie Heaven (dytt8) sample: pull the "latest movies" list from the home
# page, follow each detail link, and print the title plus download link.
import re
import requests

home_url = "https://www.dytt8.net/"
response = requests.get(home_url)
response.encoding = 'gb2312'

# Three extraction stages: the <ul> holding the update list, the anchors
# inside it, and the title/download-link pair on each detail page.
list_pattern = re.compile(r"最新电影更新:.*?<ul>(?P<hre>.*?)</ul>", re.S)
anchor_pattern = re.compile(r"<a href='(?P<hre>.*?)'>(?P<name>.*?)</a><br/>", re.S)
detail_pattern = re.compile(r'◎译 名(?P<name>.*?)<br />.*?<a target="_blank" href="(?P<lianjie>.*?)">', re.S)

for block in list_pattern.finditer(response.text):
    for anchor in anchor_pattern.finditer(block.group("hre")):
        # Hrefs are site-relative; strip the leading slash before joining.
        detail_url = home_url + anchor.group('hre').strip("/")
        detail_resp = requests.get(detail_url)
        detail_resp.encoding = 'gb2312'
        for hit in detail_pattern.finditer(detail_resp.text):
            print(hit.group('name'))
            print(hit.group('lianjie'))
获取新发地菜价
# bs4 practice: xinfadi changed its pages - price data is now served by a
# POST endpoint, so fetch it as JSON (then test bs4 on another site).
import requests

price_url = 'http://www.xinfadi.com.cn/getPriceData.html'
# Pagination / filter arguments expected by the endpoint.
query = {
    "limit": "20",
    "current": "2",
    "pubDateStartTime": "",
    "pubDateEndTime": "",
    "prodPcatid": "",
    "prodCatid": "",
    "prodName": "",
}
ua_header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
resp = requests.post(url=price_url, params=query, headers=ua_header)
print(resp.json())
resp.close()
Bs4获取电影天堂相关链接
# bs4 practice: xinfadi's site changed, so switch back to Movie Heaven
# and list every link inside the co_content2 section.
import requests
from bs4 import BeautifulSoup

home = 'https://www.dytt8.net/'
resp = requests.get(home)
resp.encoding = 'gb2312'

# Parse the page, locate the content block, then walk its anchors.
soup = BeautifulSoup(resp.text, 'html.parser')
section = soup.find('div', attrs={
    'class': 'co_content2'
})
for anchor in section.find_all('a'):
    print(anchor.get('href'))
    print(anchor.text)
Bs4获取优美图库首页图片
# umei.cc: download the cover image of each entry in the cartoon category.
import requests
from bs4 import BeautifulSoup
import time

url = "https://www.umei.cc/katongdongman/"
html = requests.get(url)
html.encoding = 'gb2312'
page = BeautifulSoup(html.text, 'html.parser')
content = page.find('div', attrs={
    'class': 'TypeList'
})
if content is None:
    # Fail fast with a clear message instead of an AttributeError.
    raise SystemExit('TypeList section not found - page layout changed?')
for lx in content.find_all('a'):
    curl = "https://www.umei.cc" + lx.get('href')
    childhtml = requests.get(curl)
    childhtml.encoding = 'gb2312'
    childpage = BeautifulSoup(childhtml.text, 'html.parser')
    childcontent = childpage.find('div', attrs={'class': 'ImageBody'})
    if childcontent is None or childcontent.find('img') is None:
        # Detail page without an image body: skip instead of crashing mid-run.
        continue
    clx = childcontent.find('img').get('src')
    img = requests.get(clx)
    img_name = clx.split("/")[-1]  # file name = last URL path segment
    with open(img_name, mode="wb") as f:
        f.write(img.content)  # write the raw image bytes
    print('over!!!!', img_name)
    time.sleep(1)  # be polite: at most one request per second
当当网好评500
# Dangdang five-star bestsellers: scrape 25 listing pages into book.csv.
import requests
import csv
from bs4 import BeautifulSoup

# with-block guarantees the file is closed; newline="" prevents blank rows
# on Windows, utf-8 keeps the Chinese titles intact.
with open("book.csv", "w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for i in range(1, 26):
        url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(i)
        html = requests.get(url)
        html.encoding = 'gb2312'
        page = BeautifulSoup(html.text, 'html.parser')
        content = page.find('ul', attrs={
            'class': 'bang_list_mode'
        })
        for book in content.find_all('li'):
            # "row" rather than "dict": do not shadow the builtin.
            row = {}
            row['num'] = book.find('div', attrs={'class': 'list_num'}).text.split('.')[0]
            row['pic'] = book.find('div', attrs={'class': 'pic'}).find('img').get('src')
            row['name'] = book.find('div', attrs={'class': 'name'}).find('a').text
            row['star'] = book.find('div', attrs={'class': 'star'}).find('a').text
            row['tuijian'] = book.find('div', attrs={'class': 'star'}).find('span', attrs={'class': 'tuijian'}).text
            row['price'] = book.find('div', attrs={'class': 'price'}).find('span', attrs={'class': 'price_n'}).text
            csvwriter.writerow(row.values())
print('over')
豆瓣网TOP250
# Douban Top 250 via BeautifulSoup: 10 pages x 25 movies into movie.csv.
import requests
import csv
from bs4 import BeautifulSoup

# Header is request-invariant, so build it once outside the loop.
header = {
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
# with-block guarantees the file is closed; newline="" prevents blank rows
# on Windows, utf-8 keeps the Chinese titles intact.
with open("movie.csv", "w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for i in range(0, 10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        html = requests.get(url, headers=header)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'html.parser')
        items = soup.find('ol', attrs={'class': 'grid_view'}).find_all('li')
        for item in items:
            # "item"/"row" rather than "list"/"dict": don't shadow builtins.
            row = {}
            row['num'] = item.find('em').text
            row['name'] = item.find(class_='title').text
            row['pic'] = item.find('img').get('src')
            disc = item.find(class_='inq')
            # Not every movie has a one-line quote; fall back to "".
            row['disc'] = disc.text if disc else ''
            csvwriter.writerow(row.values())
print('over')
获取优美网部分头像
import requests
import csv
from bs4 import BeautifulSoup
import os, sys
import time
# Get the detail-page URLs from the listing page.
def get_page_url():
    """Collect gallery hrefs from the QQ-avatar listing page(s).

    Returns a list of site-relative URLs, one per gallery.
    range(1, 2) fetches only the first listing page; raise the upper
    bound to crawl more pages.
    """
    urls = []
    for i in range(1, 2):
        # Page 1 has no numeric suffix; later pages are index_<n>.htm.
        if i == 1:
            baseurl = 'https://www.umei.cc/touxiangtupian/QQtouxiang/index.htm'
        else:
            baseurl = 'https://www.umei.cc/touxiangtupian/QQtouxiang/index_' + str(i) + '.htm'
        html = requests.get(baseurl)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'html.parser')
        items = soup.find('div', attrs={'class': 'TypeList'}).find_all('li')
        for item in items:  # "item": avoid shadowing the builtin `list`
            urls.append(item.find('a').get('href'))
    return urls
# Download pictures.
def downloadpic(title, img_list):
    """Save each URL in img_list under ./<title>/<basename>."""
    # makedirs(exist_ok=True): re-running for the same gallery must not
    # crash the way a bare os.mkdir would.
    os.makedirs(str(title), exist_ok=True)
    for item in img_list:
        img_name = item.split("/")[-1]  # file name = last URL path segment
        filename = '%s/%s' % (title, img_name)
        con = requests.get(item)
        with open(filename, mode="wb") as f:
            f.write(con.content)  # write the raw image bytes
# Resolve all image URLs of one gallery, then download them.
def download(url):
    """Fetch gallery pages <url>, <url>_2.htm, ... and download every image.

    The total page count is parsed from the trailing pagination link.
    """
    baseurl = "https://www.umei.cc/" + str(url)
    html = requests.get(baseurl)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'html.parser')
    # Last <a> in the pager looks like ..._<num>.htm; pull out <num>.
    lastnum = soup.find(class_='NewPages').find_all('a')[-1].get('href')
    num = (lastnum.split('_')[-1]).split('.')[0]
    img_list = []
    title_name = soup.find(class_='ArticleTitle').find('strong').text
    name = title_name.split(" ")[0]
    # BUG FIX: range(1, int(num)) skipped the final page; +1 includes it,
    # matching the corrected duplicate of this crawler elsewhere in the file.
    for i in range(1, int(num) + 1):
        if i == 1:
            curl = baseurl  # first page has no _<n> suffix
        else:
            curl = baseurl.split('.htm')[0] + "_" + str(i) + ".htm"
        chtml = requests.get(curl)
        chtml.encoding = 'utf-8'
        csoup = BeautifulSoup(chtml.text, 'html.parser')
        img_list.append(csoup.find(class_='ImageBody').find('img').get('src'))
    downloadpic(name, img_list)
# Entry point: crawl every gallery found on the listing page.
# Guarded so the crawl does not fire if this module is ever imported.
if __name__ == '__main__':
    for url in get_page_url():
        download(url)
多线程获取妹子图
import requests
import csv
from bs4 import BeautifulSoup
import os, sys
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Executor
from concurrent import futures
# Get the detail-page URLs from the listing page.
def get_page_url():
    """Collect gallery hrefs from the meinvtupian listing page(s).

    Returns a list of site-relative URLs, one per gallery.
    range(1, 2) fetches only the first listing page; raise the upper
    bound to crawl more pages.
    """
    urls = []
    for i in range(1, 2):
        # Page 1 has no numeric suffix; later pages are index_<n>.htm.
        if i == 1:
            baseurl = 'https://www.umei.cc/meinvtupian/index.htm'
        else:
            baseurl = 'https://www.umei.cc/meinvtupian/index_' + str(i) + '.htm'
        html = requests.get(baseurl)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'html.parser')
        items = soup.find('div', attrs={'class': 'TypeList'}).find_all('li')
        for item in items:  # "item": avoid shadowing the builtin `list`
            urls.append(item.find('a').get('href'))
    return urls
# Download pictures.
def downloadpic(title, img_list):
    """Save each URL in img_list under ./<title>/<basename>."""
    # makedirs(exist_ok=True): re-running for the same gallery must not
    # crash the way a bare os.mkdir would.
    os.makedirs(str(title), exist_ok=True)
    for item in img_list:
        img_name = item.split("/")[-1]  # file name = last URL path segment
        filename = '%s/%s' % (title, img_name)
        con = requests.get(item)
        print(filename + "...下载中")
        with open(filename, mode="wb") as f:
            f.write(con.content)  # write the raw image bytes
# Collect every image URL in one gallery, then hand them to downloadpic.
def download(url):
    """Walk all pages of the umei gallery at `url` and download its images."""
    gallery_url = "https://www.umei.cc/" + str(url)
    resp = requests.get(gallery_url)
    resp.encoding = 'utf-8'
    first_page = BeautifulSoup(resp.text, 'html.parser')
    # The pager's last anchor ends in _<pages>.htm - extract the page count.
    last_href = first_page.find(class_='NewPages').find_all('a')[-1].get('href')
    page_count = last_href.split('_')[-1].split('.')[0]
    title_text = first_page.find(class_='ArticleTitle').find('strong').text
    gallery_name = title_text.split(" ")[0]
    image_urls = []
    for page_no in range(1, int(page_count) + 1):
        # First page keeps the bare URL; later pages get a _<n>.htm suffix.
        page_url = gallery_url if page_no == 1 else gallery_url.split('.htm')[0] + "_" + str(page_no) + ".htm"
        page_resp = requests.get(page_url)
        page_resp.encoding = 'utf-8'
        page_soup = BeautifulSoup(page_resp.text, 'html.parser')
        image_urls.append(page_soup.find(class_='ImageBody').find('img').get('src'))
    downloadpic(gallery_name, image_urls)
def download_all_images(list_page_urls):
    """Download every gallery in list_page_urls, one worker thread each.

    Empty input is a no-op: passing 0 workers to ThreadPoolExecutor
    would raise ValueError.
    """
    works = len(list_page_urls)
    if works == 0:
        return
    with futures.ThreadPoolExecutor(works) as exector:
        for url in list_page_urls:
            exector.submit(download, url)
# Entry point: fan the listing's galleries out across the thread pool.
# Guarded so the crawl does not fire if this module is ever imported.
if __name__ == '__main__':
    download_all_images(get_page_url())
改善豆瓣抓取,增加多进程
import requests
import csv
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import os
import threading
import multiprocessing
def main(urls):
    """Scrape one Douban Top-250 listing page and print each movie.

    NOTE(review): despite the plural name, `urls` is a single page URL -
    pool.map feeds URLs in one at a time. Name kept for compatibility.
    """
    header = {
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    html = requests.get(urls, headers=header)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'html.parser')
    items = soup.find('ol', attrs={'class': 'grid_view'}).find_all('li')
    for item in items:
        # "item"/"row" rather than "list"/"dict": don't shadow builtins.
        row = {}
        row['num'] = item.find('em').text
        row['name'] = item.find(class_='title').text
        row['pic'] = item.find('img').get('src')
        disc = item.find(class_='inq')
        # Not every movie has a one-line quote; fall back to "".
        row['disc'] = disc.text if disc else ''
        print(row)
if __name__ == '__main__':
    # Build the ten paginated URLs, then scrape them across a process pool.
    urls = ['https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
            for i in range(0, 10)]
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(main, urls)
    pool.close()
    pool.join()
获取妹子图2
妹子图网站做了 IP 请求次数跟时间间隔的限制处理,暂时先放在这里
import requests
import csv
from bs4 import BeautifulSoup
import os, sys
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Executor
from concurrent import futures
# def get_page_url():
# urls= []
# baseurl = 'https://www.mzitu.com/'
# headers = {
# "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
# }
# html = requests.get(baseurl,headers=headers)
# soup = BeautifulSoup(html.text,'html.parser')
# lists=soup.find('div',attrs={'class':'postlist'}).find_all('li')
# for list in lists:
# chilhref = list.find('a').get('href')
# urls.append(chilhref)
# return urls
def downloadpic(title, img_list):
    """Save each URL in img_list under ./<title>/<basename>.

    Requests carry a Referer header; presumably the site rejects image
    requests without one (anti-hotlinking) - TODO confirm.
    """
    # makedirs(exist_ok=True): re-running for the same gallery must not
    # crash the way a bare os.mkdir would.
    os.makedirs(str(title), exist_ok=True)
    headers = {
        "Referer": "https://www.mzitu.com/",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }
    for item in img_list:
        img_name = item.split("/")[-1]  # file name = last URL path segment
        filename = '%s/%s' % (title, img_name)
        con = requests.get(item, headers=headers)
        print(filename + "...下载中")
        with open(filename, mode="wb") as f:
            f.write(con.content)  # write the raw image bytes
# Resolve one gallery's image URLs, then download them.
def get_child_pic(baseurl):
    """Walk every page of the mzitu gallery at baseurl and download images."""
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }
    html = requests.get(baseurl, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    # Second-to-last pager anchor holds the total page count.
    lastnum = soup.find(class_='pagenavi').find_all('a')[-2].find('span').text
    img_list = []
    name = soup.find(class_='main-title').text
    print(name)
    # BUG FIX: the loop bound was the literal int(1), so only the first
    # image was ever fetched; use the parsed page count instead.
    for i in range(1, int(lastnum) + 1):
        curl = baseurl + '/' + str(i)
        chtml = requests.get(curl, headers=headers)
        csoup = BeautifulSoup(chtml.text, 'html.parser')
        img_list.append(csoup.find(class_='main-image').find('img').get('src'))
    downloadpic(name, img_list)
# Entry point: crawl a single hard-coded gallery.
# Guarded so the crawl does not fire if this module is ever imported.
if __name__ == '__main__':
    get_child_pic("https://www.mzitu.com/254266")
# print(get_page_url())
# def download_all_images(list_page_urls):
# # 获取每一个详情妹纸
# works = len(list_page_urls)
# with futures.ThreadPoolExecutor(works) as exector:
# for url in list_page_urls:
# exector.submit(download,url)
# download_all_images(get_page_url())