900字范文,内容丰富有趣,生活中的好帮手!
900字范文 > python判断对错题_Python爬虫自动化获取华图和粉笔网站的错题(推荐)

python判断对错题_Python爬虫自动化获取华图和粉笔网站的错题(推荐)

时间:2020-10-07 08:48:22

相关推荐

python判断对错题_Python爬虫自动化获取华图和粉笔网站的错题(推荐)

这篇博客对于考公人或者其他用华图或者粉笔做题的人比较友好,通过输入网址可以自动化获取华图以及粉笔练习的错题。

粉笔网站

我们从做过的题目组中获取错题

打开某一次做题组,我们首先进行抓包看看数据在哪里

我们发现现在数据已经被隐藏,事实上数据在这两个包中:

/api/xingce/questions

/api/xingce/solutions

一个为题目的,一个为解析的。此url要通过传入一个题目组参数才能获取到当前题目数据,而题目组参数在这个包中

与网址的倒数第二个数字串有关

url的规则为'/api/xingce/exercises/'+str(id_)+'?app=web&kav=12&version=3.0.0.0',id_即为下划线数字

通过请求这个包获取到参数,然后通过参数请求上面两个包(/api/xingce/questions 和 /api/xingce/solutions)。

不过粉笔的题目数据有些是图片,而且图片在题目中,选项中,这里以word文档存储操作docx库有些吃力,于是我想到了直接构造HTML代码,然后通过pdfkit转为pdf(具体如何下载可以参考百度,要下载wkhtmltopdf.exe)即可变为错题集在平板或者其他设备中看。

(请求时一定要携带完整的headers,否则很可能获取不到数据)

具体操作看代码解析

###此函数用于解析题目和每道题的答案

### Clean the question/answer strings scraped from the Fenbi JSON:
### first drop the escaped LaTeX marker fragments, then remove the
### remaining backslash escape characters.
def jiexi(liebiao):
    """Return a new list with Fenbi markup artifacts stripped from each string.

    For every element: remove the escaped ``flag=...tex...`` attribute
    fragment, then delete all remaining backslashes.  The input list is
    not modified.
    """
    cleaned = []
    for raw in liebiao:
        no_flag = re.sub(r'flag=\\"tex\\" ', '', raw)
        cleaned.append(re.sub(r'\\', '', no_flag))
    return cleaned

###此函数用于解析选项

### Normalize the raw option strings scraped from the Fenbi API.
def xuanxiang(liebiao):
    """Strip paragraph tags from each option and append an HTML line break.

    NOTE(review): in the published listing the string literals inside this
    function contained raw line breaks — the HTML tags were swallowed by
    the blog rendering, leaving a SyntaxError.  Reconstructed here with the
    most plausible intent: remove the wrapping ``<p>`` / ``</p>`` tags from
    every option, then append ``<br>`` so each option renders on its own
    line in the generated HTML document.  Confirm against a working copy
    of the original scraper.
    """
    stripped = []
    for each in liebiao:
        # Drop the opening and closing paragraph tags inserted by the API.
        a = re.sub('<p>', '', each)
        a = re.sub('</p>', '', a)
        stripped.append(a)
    # Append an explicit line break to every cleaned option.
    return [each + '<br>' for each in stripped]

import requests

import re

import pdfkit

import os

# --- Fenbi wrong-question scraper (flat script) ---
# NOTE(review): this listing was recovered from a rendered blog page.  The
# rendering destroyed all indentation (loop/conditional bodies below sit at
# column 0) and swallowed parts of some string literals (a few quoted
# strings contain raw line breaks where an HTML tag, presumably '<br>',
# once stood).  The script is NOT runnable as-is; restore indentation and
# the lost literals against a working copy before use.

url = str(input("请输入练习的网址:"))

### Extract this practice session's id from the pasted URL
### (the second-to-last number group, per the article text).

id_ = re.findall(r'/spa/tiku.*?/xingce/xingce/(.*?)/',url,re.S)[0]

# NOTE(review): relative path — the scheme/host (presumably the Fenbi API
# domain) appears to have been stripped by the blog rendering.
mid_url = '/api/xingce/exercises/'+str(id_)+'?app=web&kav=12&version=3.0.0.0'

headers = {

##### fill in the complete request headers yourself — without them the API
##### returns no data

}

response = requests.get(url=mid_url,headers=headers)

response.encoding = 'utf-8'

page_text = response.text

### Question-group parameter: the list of question ids for this exercise.

id_list = re.findall('\"questionIds\"\:\[(.*?)\]\,',page_text,re.S)

### The user's own answers (choice indices as strings).

your_answer = re.findall(r'"answer":{"choice":"(.*?)",',page_text,re.S)

### Name of this exercise — reused later as the output file name.

name = re.findall(r'"name":"(.*?)",',page_text,re.S)[0]

### Endpoint that actually holds the question data.

timu_url = '/api/xingce/questions'

params = {

'ids': id_list

}

response = requests.get(url=timu_url,headers=headers,params=params)

response.encoding = 'utf-8'

page_text = response.text

### Correct answers (choice indices as strings, "0".."3" per the mapping below).

true_answer = re.findall('"correctAnswer":{"choice":"(.*?)"',page_text,re.S)

### Endpoint that holds the solution/explanation data.

solution_url = '/api/xingce/solutions'

response = requests.get(url=solution_url,headers=headers,params=params)

response.encoding = 'utf-8'

page_text = response.text

### Extract the per-question explanations from the solutions payload.

solution_list = re.findall(r'"solution":"(.*?)","userAnswer"',page_text,re.S)

solution_last = jiexi(solution_list)

cailiao = []  # shared "material" passage per question; 'none' when absent

timu = []  # question stems

### Collect each stem, plus the shared material for compound questions.
# NOTE(review): the bodies of the for/try below lost their indentation in
# the blog rendering.

for each in response.json():

timu.append(each['content'])

try:

cailiao.append(each['material']['content'])

# Bare except is deliberate best-effort here: plain (non-compound)
# questions simply have no 'material' key.
except:

cailiao.append('none')

### Pull the four option texts out of the raw JSON text, one regex per slot.

A_option = re.findall('\"options\"\:\[\"(.*?)\"\,\".*?\"\,\".*?\"\,\".*?\"\]',page_text,re.S)

B_option = re.findall('\"options\"\:\[\".*?\"\,\"(.*?)\"\,\".*?\"\,\".*?\"\]',page_text,re.S)

C_option = re.findall('\"options\"\:\[\".*?\"\,\".*?\"\,\"(.*?)\"\,\".*?\"\]',page_text,re.S)

D_option = re.findall('\"options\"\:\[\".*?\"\,\".*?\"\,\".*?\"\,\"(.*?)\"\]',page_text,re.S)

### Normalize the option markup, then strip escape artifacts.

A_option = xuanxiang(A_option)

B_option = xuanxiang(B_option)

C_option = xuanxiang(C_option)

D_option = xuanxiang(D_option)

A_option = jiexi(A_option)

B_option = jiexi(B_option)

C_option = jiexi(C_option)

D_option = jiexi(D_option)

### Build the HTML document containing only the wrongly-answered questions.

count = 0

all_content = "\n\n"

# First pass: emit material + stem + the four options for every question
# the user answered incorrectly.
for each in true_answer:

if each != your_answer[count]:

### Compound questions: emit the shared material only once.

if cailiao[count] != 'none' and cailiao[count] not in all_content:

all_content += cailiao[count]

all_content += str(count+1)

all_content += '、'

# [3:] drops the leading characters of the stem — presumably a markup
# tag swallowed elsewhere; TODO confirm against a live response.
all_content += timu[count][3:]

all_content += 'A、'

all_content += A_option[count]

all_content += 'B、'

all_content += B_option[count]

all_content += 'C、'

all_content += C_option[count]

all_content += 'D、'

all_content += D_option[count]

# NOTE(review): raw line break inside the literal below — the original
# tag (likely '<br>') was swallowed by the blog rendering; SyntaxError
# as-is.
all_content += '

'

count += 1

count = 0

# NOTE(review): same swallowed-literal problem as above.
all_content += '

'

# Second pass: emit the correct answer and the explanation for each
# wrongly-answered question.
for each in true_answer:

if each != your_answer[count]:

temp = '第'+str(count+1)+'题的正确答案为'

all_content += temp

# Map the API's numeric choice index to the option letter.
if true_answer[count]=='0':

all_content += 'A'

elif true_answer[count]=='1':

all_content += 'B'

elif true_answer[count]=='2':

all_content += 'C'

elif true_answer[count]=='3':

all_content += 'D'

all_content += solution_last[count]

# NOTE(review): same swallowed-literal problem as above.
all_content += '

'

count += 1

# NOTE(review): appending '' is a no-op as shown — presumably a closing
# HTML tag was swallowed by the blog rendering here.
all_content += ''

path_name = name + '.html'

### Save the assembled document as an intermediate HTML file.

with open(path_name,'w',encoding='utf-8') as fp:

fp.write(all_content)

# Path where wkhtmltopdf.exe is installed (required by pdfkit on Windows).
confg = pdfkit.configuration(wkhtmltopdf=r'wkhtmltopdf.exe保存的路径')

pdfkit.from_url(path_name, name+'.pdf',configuration=confg)### convert the HTML file to PDF

print('错题PDF保存成功')

### Delete the intermediate HTML file, keeping only the PDF.

os.remove(path_name)

华图网站

也是答题记录中自己做过的题目

华图网站稍微不一样,他的数据直接抓包就可看到

通过请求这个包即可获取到数据,接下来就是解析的事情了,这次我用word文档进行存储,如果觉得不方便也可以像上文一样构造HTML

##导包

import requests

import lxml.etree

import re

import time

import os

from docx import Document

from docx.shared import Inches

from docx.shared import Pt

from docx.shared import Inches

from docx.oxml.ns import qn

from docx.enum.text import WD_ALIGN_PARAGRAPH

# --- Huatu wrong-question scraper (truncated fragment) ---
# NOTE(review): this second script is cut off by the blog page — it ends
# mid-way, and the final regex contains a raw line break where markup was
# swallowed by the rendering.  Treat as an incomplete fragment.

url = str(input("请输入练习的网址:"))

headers={

### fill in the complete request headers, otherwise no data is returned

}

response = requests.get(url = url,headers = headers)

response.encoding='utf-8'

reptext = response.text

tree = lxml.etree.HTML(reptext) # parse the page source into an lxml tree

dirName="考公图片"

if not os.path.exists(dirName):

os.mkdir(dirName) # folder where the site's images are saved

# NOTE(review): raw line break inside the pattern below — swallowed markup;
# SyntaxError as-is.  Also shadows the jiexi() helper defined earlier.
jiexi = re.findall(r'

解析.*?。 .*? ', reptext,re.S) # extract the question explanations

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。