NetEase Social Recruitment (网易社招)

Data Modeling

```python
import scrapy


class JobItem(scrapy.Item):
	name = scrapy.Field()  # job title
	type = scrapy.Field()  # employment type (full-time / part-time)
	city = scrapy.Field()  # work city
	company = scrapy.Field()  # hiring company

	category = scrapy.Field()  # job category
	need_num = scrapy.Field()  # number of openings
	degree = scrapy.Field()  # required education
	experience = scrapy.Field()  # required work experience

	pub_time = scrapy.Field()  # publish time
	description = scrapy.Field()  # job description
	require = scrapy.Field()  # job requirements
```
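
Aside (not from the original post): a `scrapy.Item` behaves much like a dictionary, so the fields defined above are populated and read with key access. A minimal sketch with made-up sample values:

```python
from wangyi.items import JobItem

# Hypothetical sample values, for illustration only.
item = JobItem()
item["name"] = "Backend Development Engineer"
item["city"] = "Hangzhou"

print(dict(item))  # {'name': 'Backend Development Engineer', 'city': 'Hangzhou'}
```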

Data Collection

```python
import json

import scrapy

from wangyi.items import JobItem

class HrSpider(scrapy.Spider):
	name = "hr"
	allowed_domains = ["hr.163.com"]
	start_urls = []  # unused; the initial request is built in start()

	current_page = 1  # current page of the paginated list API
	page_size = 10  # positions per page
	list_url = "https://hr.163.com/api/hr163/position/queryPage"  # job list API (POST, JSON body)
	detail_url = "https://hr.163.com/api/hr163/position/query"  # job detail API (GET, ?id=)

	headers = {
		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
		              "Chrome/139.0.0.0 Safari/537.36",
		"Content-Type": "application/json;charset=UTF-8",
		"Referer": "https://hr.163.com/job-list.html",
	}

	async def start(self):
		# Initial POST to the list API (async start() requires Scrapy >= 2.13; older versions use start_requests()).
		data = {
			"currentPage": self.current_page,
			"pageSize": self.page_size,
		}
		yield scrapy.Request(
			self.list_url,
			method="POST",
			headers=self.headers,
			body=json.dumps(data),
			callback=self.parse,
		)

	def parse(self, response):
		result = json.loads(response.text)

		if result["code"] == 200:
			jobs = result["data"]["list"]
			pages = result["data"]["pages"]
			for job in jobs:
				job_item = JobItem()
				job_item["name"] = job["name"]
				job_item["type"] = job["workType"]
				job_item["city"] = "".join(job["workPlaceNameList"])
				job_item["company"] = job["productName"]
				job_item["category"] = job["firstPostTypeName"]
				job_item["need_num"] = job["recruitNum"]
				job_item["degree"] = job["reqEducationName"]
				job_item["experience"] = job["reqWorkYearsName"]

				job_item["pub_time"] = job["updateTime"]
				job_item["description"] = job["description"]
				job_item["require"] = job["requirement"]
				yield job_item
				# Also query the detail API for this position; parse_detail yields a second item from it.
				detail_url = f'{self.detail_url}?id={job["id"]}'
				yield scrapy.Request(detail_url, callback=self.parse_detail)

			# Paging: request the next page until the last page is reached.
			if self.current_page < pages:
				self.current_page += 1
				data = {
					"currentPage": self.current_page,
					"pageSize": self.page_size,
				}
				yield scrapy.Request(
					self.list_url,
					method="POST",
					headers=self.headers,
					body=json.dumps(data),
					callback=self.parse,
				)

	# Parse the detail API response into its own JobItem.
	def parse_detail(self, response):
		result = json.loads(response.text)
		if result["code"] == 200:
			job = result["data"]
			job_item = JobItem()
			job_item["name"] = job["name"]
			job_item["type"] = job["workType"]
			job_item["city"] = "".join(job["workPlaceNameList"])
			job_item["company"] = job["productName"]
			job_item["category"] = job["firstPostTypeName"]
			job_item["need_num"] = job["recruitNum"]
			job_item["degree"] = job["reqEducationName"]
			job_item["experience"] = job["reqWorkYearsName"]

			job_item["pub_time"] = job["updateTime"]
			job_item["description"] = job["description"]
			job_item["require"] = job["requirement"]
			yield job_item
```
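
To run the spider and export the items, `scrapy crawl hr -o wangyi_jobs.json` from the project root is enough; alternatively, a small script can drive it through Scrapy's `CrawlerProcess`. A minimal sketch, assuming the spider lives at `wangyi/spiders/hr.py` (the module path is not shown in the original post):

```python
# run_hr.py -- drive the spider programmatically and export items as JSON.
from scrapy.crawler import CrawlerProcess

from wangyi.spiders.hr import HrSpider  # assumed module path

process = CrawlerProcess(settings={
	# Write every yielded JobItem to a UTF-8 JSON file.
	"FEEDS": {"wangyi_jobs.json": {"format": "json", "encoding": "utf8"}},
})
process.crawl(HrSpider)
process.start()  # blocks until the crawl finishes
```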