这篇文章是接着上篇文章的。前段时间太忙未更新。上篇文章说到海报下载及命名,按照上篇文章下载的海报会有问题:海报破损,无法打开。
下面是我更新后的代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : /6/10 22:35
# @Author  : LJ
# @Site    :
# @File    : doubantest.py
# @Software: PyCharm
import re
import requests
from requests import RequestException
from demo01.util import buid_proxy
from urllib.parse import urlencode
import json
import time
import codecs
import os
# Rotating proxy pool built by the project helper; used by every request below.
proxies = buid_proxy()
# Browser-like headers so Douban does not reject the requests outright.
# NOTE(review): the original post's Referer was sanitized to "/"; restored to
# the Douban movie site -- confirm against the real target.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}
def get_one_page():
    """Fetch one page of "hot" movies from Douban's JSON search API.

    Returns:
        The parsed JSON payload as a dict, or None when the response status
        is not 200 or the request raises a RequestException.
    """
    # Build the real request URL from the query parameters.
    # NOTE(review): the blog sanitized the host to "/"; restored to the known
    # Douban endpoint -- confirm it still matches the live API.
    parameters = {
        'type': 'movie',
        'tag': '热门',
        'page_limit': '50',
        'page_start': 0,
    }
    url = 'https://movie.douban.com/j/search_subjects?' + urlencode(parameters)
    # print(url)  # uncomment to verify the constructed URL is correct
    try:
        res = requests.get(url, headers=headers, proxies=proxies)
        if res.status_code == 200:
            return json.loads(res.text)
        return None
    except RequestException:
        # Network/proxy failure: signal "no data" rather than crash.
        return None
def parse_one_page(data):
    """Extract per-movie fields from the API payload.

    Args:
        data: dict parsed from the Douban JSON response; the movie entries
            live under the 'subjects' key. May be None when the fetch failed.

    Returns:
        A list of dicts with keys 'movie_id', 'movie_name', 'movie_url' and
        'img_url' -- empty when *data* is None/empty or has no 'subjects' key
        (empty input must not crash the caller's iteration).
    """
    if not data or 'subjects' not in data:
        return []
    return [
        {
            "movie_id": item.get('id'),
            "movie_name": item.get('title'),
            "movie_url": item.get('url'),
            "img_url": item.get('cover'),
        }
        for item in data.get('subjects')
    ]
def save_to_file(content):
    """Serialize *content* to a dated JSON file in the output directory.

    Args:
        content: any JSON-serializable object (here, the list produced by
            parse_one_page).
    """
    file_path = 'E:/test001/photo/douban/nowplaying%s.json' % time.strftime("%Y-%m-%d")
    # ensure_ascii=False keeps the Chinese titles readable in the file.
    text = json.dumps(content, ensure_ascii=False)
    # Text mode with explicit UTF-8 replaces the original wb + str.encode();
    # the with-statement closes the file, so no explicit close() is needed.
    with open(file_path, 'w', encoding='utf-8') as fs:
        fs.write(text)
# Directory where downloaded poster images are written.
path = 'E:/test001/photo/douban/'
def save_img_to_file(data):
    """Download every movie poster referenced in *data* to the poster dir.

    Args:
        data: dict parsed from the Douban JSON response; poster URLs live in
            each entry's 'cover' field under the 'subjects' key. May be None.
    """
    # Guard clause: nothing to download when the fetch failed or the payload
    # has no movie list (iterating None would raise).
    if not data or 'subjects' not in data:
        return
    for item in data.get('subjects'):
        title = item.get('title')
        img_url = item.get('cover')  # poster image URL
        img_path = path + title + '.jpg'  # poster file name from the title
        res = requests.get(img_url, proxies=proxies, headers=headers)
        # res.content is the raw image bytes; writing text would corrupt it.
        with open(img_path, 'wb') as img_f:
            img_f.write(res.content)
        # Throttle between downloads (outside the with, after the file closes).
        time.sleep(2)
def main():
    """Fetch the hot-movie listing, save it as JSON, then download posters."""
    data = get_one_page()
    content = parse_one_page(data)
    save_to_file(content)
    save_img_to_file(data)
    print('下载完成。。。。。')
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
这样就可以同时下载豆瓣的热门电影信息和完好的海报了。