# NetEase Social Recruitment (网易社招)

This walkthrough uses Scrapy to collect job postings from NetEase's recruitment site, hr.163.com. The listings are served by a JSON API, so the spider posts paginated queries to the list endpoint and then follows each position's detail endpoint.
## Data Modeling

Each position is modeled as a Scrapy `Item` in `wangyi/items.py`:

```python
import scrapy


class JobItem(scrapy.Item):
    name = scrapy.Field()         # job title
    type = scrapy.Field()         # employment type (full-time / part-time)
    city = scrapy.Field()         # work city
    company = scrapy.Field()      # hiring company
    category = scrapy.Field()     # job category
    need_num = scrapy.Field()     # number of openings
    degree = scrapy.Field()       # required education
    experience = scrapy.Field()   # required work experience
    pub_time = scrapy.Field()     # publish time
    description = scrapy.Field()  # job description
    require = scrapy.Field()      # job requirements
```
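Since `JobItem` is a `scrapy.Item`, it behaves like a dictionary that only accepts the declared fields, which catches field-name typos early. A quick sketch (the values are placeholders, and it assumes the `wangyi` project package is importable):

```python
from wangyi.items import JobItem

item = JobItem(name="测试岗位", city="杭州")   # placeholder values
item["need_num"] = 2

print(item["name"])   # 测试岗位
print(dict(item))     # plain dict, convenient for pipelines and feed exports

try:
    item["salary"] = "20k"   # 'salary' was never declared on JobItem
except KeyError as exc:
    print(exc)               # Scrapy rejects undeclared fields
```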
## Data Collection

The `hr` spider posts paginated queries to the list endpoint, maps each entry onto a `JobItem`, and follows every position's detail endpoint:

```python
import json
import scrapy
from wangyi.items import JobItem


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["hr.163.com"]
    start_urls = []
    current_page = 1
    page_size = 10
    # JSON API endpoints behind https://hr.163.com/job-list.html
    list_url = "https://hr.163.com/api/hr163/position/queryPage"
    detail_url = "https://hr.163.com/api/hr163/position/query"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/139.0.0.0 Safari/537.36",
        "Content-Type": "application/json;charset=UTF-8",
        "Referer": "https://hr.163.com/job-list.html",
    }

    # async start() is the Scrapy 2.13+ replacement for start_requests():
    # kick things off with a JSON POST for the first page of the list.
    async def start(self):
        data = {
            "currentPage": self.current_page,
            "pageSize": self.page_size,
        }
        yield scrapy.Request(
            self.list_url,
            method="POST",
            headers=self.headers,
            body=json.dumps(data),
            callback=self.parse,
        )

    def parse(self, response):
        result = json.loads(response.text)
        if result["code"] == 200:
            jobs = result["data"]["list"]
            pages = result["data"]["pages"]
            for job in jobs:
                job_item = JobItem()
                job_item["name"] = job["name"]
                job_item["type"] = job["workType"]
                job_item["city"] = "".join(job["workPlaceNameList"])
                job_item["company"] = job["productName"]
                job_item["category"] = job["firstPostTypeName"]
                job_item["need_num"] = job["recruitNum"]
                job_item["degree"] = job["reqEducationName"]
                job_item["experience"] = job["reqWorkYearsName"]
                job_item["pub_time"] = job["updateTime"]
                job_item["description"] = job["description"]
                job_item["require"] = job["requirement"]
                yield job_item
                # Follow this position's detail endpoint.
                detail_url = f'{self.detail_url}?id={job["id"]}'
                yield scrapy.Request(detail_url, callback=self.parse_detail)
            # Pagination: request the next page until the last one reported by the API.
            if self.current_page < pages:
                self.current_page += 1
                data = {
                    "currentPage": self.current_page,
                    "pageSize": self.page_size,
                }
                yield scrapy.Request(
                    self.list_url,
                    method="POST",
                    headers=self.headers,
                    body=json.dumps(data),
                    callback=self.parse,
                )

    # Parse a position's detail response; it carries the same fields as a list entry.
    def parse_detail(self, response):
        result = json.loads(response.text)
        if result["code"] == 200:
            job = result["data"]
            job_item = JobItem()
            job_item["name"] = job["name"]
            job_item["type"] = job["workType"]
            job_item["city"] = "".join(job["workPlaceNameList"])
            job_item["company"] = job["productName"]
            job_item["category"] = job["firstPostTypeName"]
            job_item["need_num"] = job["recruitNum"]
            job_item["degree"] = job["reqEducationName"]
            job_item["experience"] = job["reqWorkYearsName"]
            job_item["pub_time"] = job["updateTime"]
            job_item["description"] = job["description"]
            job_item["require"] = job["requirement"]
            yield job_item
```
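Note that each position is yielded twice: once from the list response in `parse()` and again from the detail response in `parse_detail()`. If duplicates are unwanted downstream, a small item pipeline can drop the second copy. The sketch below is my own addition; the pipeline name and the dedup key are assumptions, not part of the original project:

```python
# wangyi/pipelines.py (hypothetical)
from scrapy.exceptions import DropItem


class DedupJobPipeline:
    """Keep only the first item seen for each position."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        key = (item.get("name"), item.get("company"), item.get("pub_time"))
        if key in self.seen:
            raise DropItem(f"duplicate job: {key}")
        self.seen.add(key)
        return item
```

The spider runs with the usual `scrapy crawl hr` command (for example, `scrapy crawl hr -O jobs.json`). To run it from a script instead, something along these lines should work; the output filename, delay, spider module path, and pipeline registration are assumptions rather than part of the tutorial:

```python
# run.py (hypothetical) -- run the spider and export JSON Lines
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from wangyi.spiders.hr import HrSpider   # assumed module path for the spider

settings = get_project_settings()
settings.set("ROBOTSTXT_OBEY", False)    # assumption: skip robots.txt for the JSON API
settings.set("DOWNLOAD_DELAY", 1)        # be polite to hr.163.com
settings.set("ITEM_PIPELINES", {"wangyi.pipelines.DedupJobPipeline": 300})
settings.set("FEEDS", {"jobs.jsonl": {"format": "jsonlines", "encoding": "utf8"}})

process = CrawlerProcess(settings)
process.crawl(HrSpider)
process.start()   # blocks until the crawl finishes
```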