
Scraping Liepin (liepin.com) Job Postings with the pyspider Crawler Framework

Posted: 2021-01-08 15:54:39


Requirements

Scraping requirements

1. Scrape postings region by region.

2. Capture the job title, salary, education requirement, required years of experience, publication time, company name, and the company's industry (a sample record is sketched below).
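For concreteness, each posting the script collects ends up as one record shaped roughly like this. The field names are the ones used in the script further down; every value here is an invented placeholder, not real scraped data:

import datetime

# Shape of one scraped record. Field names match the script below;
# all values are invented placeholders, not real data.
record = {
    "belonging": "互联网/游戏/软件",              # industry the search started from
    "city": "北京",
    "district": "海淀区",
    "position_name": "数据分析师",
    "salary": "15-25万",
    "education": "统招本科",
    "experience": "3年工作经验",
    "public_time": datetime.datetime(2021, 1, 8),  # parsed posting date
    "company": "某科技有限公司",
    "company_belong": "计算机软件",
    "time_delay": "简历处理及时",
    "welfare": "五险一金-年终奖金",
    "update_time": datetime.datetime.now(),        # time of this crawl
}
print(record["position_name"], record["salary"])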

Proxy

The code is commented in detail, so it is not walked through line by line here. Be careful running it without a proxy.
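For reference, this is a minimal sketch of how a proxy is wired into a pyspider project through crawl_config; the handler only runs inside the pyspider runtime, and the proxy address is a placeholder, not a working server:

from pyspider.libs.base_handler import *

class ProxyDemo(BaseHandler):
    crawl_config = {
        # Placeholder address; pyspider accepts an HTTP proxy given as
        # [username:password@]host:port (with or without the http:// scheme).
        "proxy": "http://localhost:6666",
    }

    def on_start(self):
        # Every self.crawl() issued by this handler goes through the proxy.
        self.crawl('https://www.liepin.com/zhaopin', callback=self.index_page)

    def index_page(self, response):
        print(response.url)

The full project script follows.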

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on -08-16 11:04:59
# Project: hunting_recruit

from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient

# Connect to the local MongoDB (host/port assumed; adjust as needed)
client = MongoClient('localhost', 27017)
DB_NAME = 'research'
DB_COL = 'hunting_recruit'
db = client[DB_NAME]
col = db[DB_COL]


class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        },
        # Route every request through a local HTTP proxy (placeholder address)
        "proxy": "http://localhost:6666"
    }

    # Entry point: Liepin's job search page (domain assumed, as it was
    # stripped from the original post)
    url = 'https://www.liepin.com/zhaopin'

    def format_date(self, date):
        # Posting dates are collected as 'YYYYMMDD' digit strings
        return datetime.datetime.strptime(date, '%Y%m%d')

    @every(minutes=24 * 60)  # restart the crawl once a day
    def on_start(self):
        self.crawl(self.url, callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        page = response.etree
        base_url = 'https://www.liepin.com'  # site root (assumed)
        # Industry list
        industry_list = page.xpath("//dd[@data-param='industries']/ul/li")
        for each in industry_list:
            title = each.xpath("./span/text()")[0]
            print('-------', title, '--------')
            # Sub-industries under each top-level industry
            sub_list = each.xpath("./div[@class='sub-industry']/a")
            for sub in sub_list:
                belonging = sub.xpath("./text()")[0]
                print(belonging)
                link_url = base_url + sub.xpath("./@href")[0]
                save = {"belonging": belonging}
                self.crawl(link_url, callback=self.parse_city, save=save)

    @config(age=60)
    def parse_city(self, response):
        page = response.etree
        base_url = 'https://www.liepin.com'
        # City list; drop the leading '全国' (nationwide) and the
        # trailing '其他' (other) entries
        city_list = page.xpath("//dd[@data-param='city']/a")[1:-1]
        for each in city_list:
            city = each.xpath("./text()")[0]
            print(city)
            link_url = base_url + each.xpath("./@href")[0]
            save = {"belonging": response.save["belonging"], "city": city}
            self.crawl(link_url, callback=self.parse_district, save=save)

    @config(age=60)
    def parse_district(self, response):
        page = response.etree
        base_url = 'https://www.liepin.com'
        # District list within the selected city
        district_list = page.xpath("//dd[@data-param='dqs']/a")
        for each in district_list:
            district = each.xpath("./text()")[0]
            print(district)
            link_url = base_url + each.xpath("./@href")[0]
            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": district}
            self.crawl(link_url, callback=self.parse_detail, save=save)

    @config(age=60)
    def parse_detail(self, response):
        page = response.etree
        # Pagination: read the page count off the '末页' (last page) link
        tail_url = page.xpath(u"//a[@title='末页']/@href")[0]
        print(tail_url)
        page_num = int(re.findall(r'&curPage=(\d+)', tail_url)[0])
        print(page_num)
        for each in range(page_num):
            page_url = response.url + '&curPage={}'.format(each)
            self.crawl(page_url, callback=self.parse_page, save=response.save)

    def parse_page(self, response):
        page = response.etree
        # Job listing entries
        contents = page.xpath("//ul[@class='sojob-list']/li")
        for each in contents:
            try:
                # Job title
                position_name = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/h3/a/text()")[0].strip()
                print(position_name)
                # Salary
                salary = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[@class='text-warning']/text()")[0]
                print(salary)
                # Education requirement
                education = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[@class='edu']/text()")[0]
                print(education)
                # Work experience requirement
                experience = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[last()]/text()")[0]
                print(experience)
                # Publication time (keep only the digits, e.g. '20210108')
                public_time = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='time-info clearfix']/time/@title")[0]
                public_time = ''.join(re.findall(r'\d+', public_time))
                print(public_time)
                # Company name
                company = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='company-name']/a/@title")[0]
                print(company)
                # Company's industry
                company_belong = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='field-financing']/span/a/text()")[0]
                print(company_belong)
                # Feedback time
                time_delay = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='time-info clearfix']/span/text()")[0]
                print(time_delay)
                # Benefits, joined with '-'
                welfare = '-'.join(each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='temptation clearfix']/span/text()"))
                print(welfare)
                print('------------------------------')
                result = {
                    "belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": response.save["district"],
                    "position_name": position_name,
                    "salary": salary,
                    "education": education,
                    "experience": experience,
                    "public_time": self.format_date(public_time),
                    "company": company,
                    "update_time": datetime.datetime.now(),
                    "company_belong": company_belong,
                    "time_delay": time_delay,
                    "welfare": welfare
                }
                yield result
            except Exception:
                # Skip entries (e.g. ads) that do not match the expected layout
                continue

    def on_result(self, result):
        if result is None:
            return
        # Upsert on a compound key so re-crawls update existing records
        # instead of inserting duplicates
        update_key = {
            'position_name': result['position_name'],
            'public_time': result['public_time'],
            'city': result['city'],
            'district': result['district'],
            'company': result['company'],
            'belonging': result['belonging']
        }
        col.update_one(update_key, {'$set': result}, upsert=True)
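Once the project has run for a while, the results can be checked directly in MongoDB. A minimal sketch, assuming the same DB_NAME/DB_COL as above and a MongoDB instance on localhost:27017:

from pymongo import MongoClient

# Same database/collection names as the script above
client = MongoClient('localhost', 27017)
col = client['research']['hunting_recruit']

# Total postings scraped so far
print(col.count_documents({}))

# Spot-check a few postings for one city
for doc in col.find({'city': '北京'}).limit(3):
    print(doc['position_name'], doc['salary'], doc['company'])

Because on_result upserts on the compound key, running this check before and after a re-crawl should show the count growing only by genuinely new postings.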
