


Scraping Boss Zhipin (boss直聘) static pages with Python and bs4

Date: 2020-10-11 | Source: www.farandoo.com | Author: 電腦系統城

Approach:

  1. Convert the list of target cities into the corresponding city codes via the city API.

  2. Iterate over the cities and job queries to generate the list-page URLs.

  3. Fetch each list page from its URL and iterate over the job entries on it.

  4. For each entry, follow its job_link to fetch the detail page, and store the extracted fields as a dict data appended to the list datas.

  5. If the list page has a next page, repeat steps 3 and 4, passing the list datas along the whole way (a self-contained sketch of this recursive pattern follows this list).

  6. Once one city/query URL has been fully crawled, append datas to datas_list, then repeat steps 3-5 for the next combination.

  7. Finally, iterate over datas_list and write the rows into an Excel file.
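Step 5's "passing datas along" works by recursion: each call handles one page, appends that page's rows, and calls itself with the next-page URL and the same list. Here is a minimal, self-contained sketch of that pattern; the fake PAGES table is a stand-in for the real requests + BeautifulSoup calls in get_job_list() below.

# Fake site: each "URL" maps to (jobs on that page, next-page URL).
PAGES = {
    "/p1": (["job-a", "job-b"], "/p2"),
    "/p2": (["job-c"], None),  # None marks the last page
}

def crawl_pages(url, rows):
    jobs, next_url = PAGES[url]  # stands in for fetching and parsing a page
    rows.extend(jobs)            # steps 3-4: collect this page's jobs
    if next_url is None:         # base case: no next page
        return rows
    return crawl_pages(next_url, rows)  # step 5: recurse with the same list

print(crawl_pages("/p1", []))    # ['job-a', 'job-b', 'job-c']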

Key points:

  1. Reading a response body as JSON and pulling values out of the parsed structure.

  2. Using soup's select(), find_all(), and find() methods (see the offline sketch after this list, which also covers point 1).

  3. Using an Exception to break out of the pagination loop.

  4. Using xlwt to create and write an Excel workbook.
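To illustrate points 1 and 2 without touching the network, here is a small sketch; the JSON payload and HTML snippet are made-up miniatures shaped like the responses parsed below, not actual Boss Zhipin data.

import json
from bs4 import BeautifulSoup

# Point 1: parse JSON and drill into it. requests' response.json()
# hands back the same kind of dict that json.loads() produces here.
payload = json.loads('{"zpData": {"cityList": [{"name": "浙江", "code": 101210000}]}}')
print(payload["zpData"]["cityList"][0]["code"])  # 101210000

# Point 2: select() takes a CSS selector and returns a list of matches;
# find_all() filters by tag name and attributes; find() returns only
# the first match (or None).
html = '<div class="job-primary"><span class="red">15-25K</span></div>'
soup = BeautifulSoup(html, "html.parser")
print(soup.select(".job-primary"))                           # CSS selector -> list
print(soup.find_all("span", attrs={"class": "red"}))         # all matching tags
print(soup.find("span", attrs={"class": "red"}).get_text())  # first match -> "15-25K"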

import requests, time, xlwt
from bs4 import BeautifulSoup
 
class MyJob():
  def __init__(self, mycity, myquery):
    self.city = mycity
    self.query = myquery
    self.list_url = "https://www.zhipin.com/job_detail/?query=%s&city=%s&industry=&position="%(self.query, self.city)
    self.datas = []
    self.header = {
      'authority': 'www.zhipin.com',
      'method': 'GET',
      'scheme': 'https',
      'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'accept-encoding': 'gzip, deflate, br',
      'accept-language': 'zh-CN,zh;q=0.9',
      'cache-control': 'max-age=0',
      'cookie': 'lastCity=101210100;uab_collina=154408714637849548916323;toUrl=/;c=1558272251;g=-;l=l=%2Fwww.zhipin.com%2Fuser%2Flogin.html&r=; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1555852331,1556985726,1558169427,1558272251; __a=40505844.1544087205.1558169426.1558272251.41.14.4.31; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1558272385',
      'referer': 'https://www.zhipin.com/?ka=header-logo',
      'upgrade-insecure-requests': '1',
      'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
 
  # Convert city names to Boss Zhipin city codes
  def get_city(self, city_list):
    city_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json" # city list endpoint
    city_json = requests.get(city_url).json()
    zpData = city_json["zpData"]["cityList"]
    codes = []
    for city in city_list:
      for data_sf in zpData: # province level
        for data_dq in data_sf["subLevelModelList"]: # city level
          if city == data_dq["name"]:
            codes.append(data_dq["code"])
    return codes
 
  # Fetch one list page, then recurse through the remaining pages
  def get_job_list(self, url, datas):
    print(url)
    html = requests.get(url, headers=self.header).text
    soup = BeautifulSoup(html, 'html.parser')
    jobs = soup.select(".job-primary")
    for job in jobs:
      data = {}
      primary = job.find_all("div", attrs={"class": "info-primary"})[0]
      company = job.find_all("div", attrs={"class": "info-company"})[0]
      publis = job.find_all("div", attrs={"class": "info-publis"})[0]
      # job id
      data["job_id"] = primary.find("a").get("data-jobid")
      # job link
      data["job_link"] = "https://www.zhipin.com" + primary.find("a").get("href")
      # job title
      data["job_name"] = primary.find("div", attrs={"class": "job-title"}).get_text()
      # salary
      data["job_red"] = primary.find("span", attrs={"class": "red"}).get_text()
      # location / years of experience / education
      data["job_address"] = primary.find("p").get_text().split(" ")
      # company link
      data["job_company_link"] = company.find("a").get("href")
      # company info
      data["job_company"] = company.find("p").get_text().split(" ")
      # boss avatar link
      data["job_publis_link"] = publis.find("img").get("src")
      # boss info
      data["job_publis"] = publis.find("h3").get_text().split(" ")
      time.sleep(5)
      self.get_job_detail(data) # fetch the detail page for this job
      print(data)
      datas.append(data) # collect every job on the current page
 
    try:
      next_url = soup.find("div", attrs={"class": "page"}).find("a", attrs={"class": "next"}).get("href")
      # if next_url[-1] == "3": # for debugging: treat page 2 as the last page
      if next_url == "javascript:;": # the last page's "next" link is inert
        raise Exception()
    except Exception as e:
      print("Reached the last page; %s" % e)
      return datas # all pages collected
    else:
      time.sleep(5)
      next_url = "https://www.zhipin.com" + next_url
      self.get_job_list(next_url, datas)
      return datas # all pages collected
 
  # Fetch the detail page and add its fields to data
  def get_job_detail(self, data):
    print(data["job_link"])
    html = requests.get(data["job_link"], headers=self.header).text
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find_all("div", attrs={"class": "detail-content"})[0]
    primary = soup.find_all("div", attrs={"class": "info-primary"})[0]
    op = soup.find_all("div", attrs={"class": "detail-op"})[1] # the second block holds the boss info
    # company name
    data["detail_content_name"] = content.find("div", attrs={"class": "name"}).get_text()
    # benefits
    data["detail_primary_tags"] = primary.find("div", attrs={"class": "job-tags"}).get_text().strip()
    # job title
    data["detail_primary_name"] = primary.find("h1").get_text()
    # job status
    data["detail_primary_status"] = primary.find("div", attrs={"class": "job-status"}).get_text()
    # salary
    data["detail_primary_salary"] = primary.find("span", attrs={"class": "salary"}).get_text()
    # location / years of experience / education
    data["detail_primary_address"] = primary.find("p").get_text()
    # work address
    data["detail_content_address"] = content.find("div", attrs={"class": "location-address"}).get_text()
    # job description
    data["detail_content_text"] = content.find("div", attrs={"class": "text"}).get_text().strip().replace(";", "\n")
    # boss name
    data["detail_op_name"] = op.find("h2", attrs={"class": "name"}).get_text()
    # boss title
    data["detail_op_job"] = op.find("p", attrs={"class": "gray"}).get_text().split("·")[0]
    # boss status
    data["detail_op_status"] = op.find("p", attrs={"class": "gray"}).get_text().split("·")[1]
 
  # Write the scraped rows into an Excel workbook
  def setExcel(self, datas_list):
    book = xlwt.Workbook(encoding='utf-8')
    table = book.add_sheet("boss software testing")
    # Columns 0-8 hold the list-page fields; column 9 is left blank as a
    # separator; columns 10-20 hold the detail-page fields, matching the
    # row writes below.
    headers = ["ID", "Job link", "Job title", "Salary",
               "Location / Experience / Education",
               "Company link", "Company info", "Boss link", "Boss info", "",
               "Company name", "Job title (detail)", "Job status",
               "Salary (detail)", "Location (detail)", "Job description",
               "Boss name", "Boss title", "Boss status", "Benefits",
               "Work address"]
    for col, header in enumerate(headers):
      table.write(0, col, header)
    i = 1
    for data in datas_list:
      table.write(i, 0, data["job_id"])
      table.write(i, 1, data["job_link"])
      table.write(i, 2, data["job_name"])
      table.write(i, 3, data["job_red"])
      # these three fields are stored as lists of strings, and xlwt
      # cannot write a list into a cell, so join them first
      table.write(i, 4, " ".join(data["job_address"]))
      table.write(i, 5, data["job_company_link"])
      table.write(i, 6, " ".join(data["job_company"]))
      table.write(i, 7, data["job_publis_link"])
      table.write(i, 8, " ".join(data["job_publis"]))
 
      table.write(i, 10, data["detail_content_name"])
      table.write(i, 11, data["detail_primary_name"])
      table.write(i, 12, data["detail_primary_status"])
      table.write(i, 13, data["detail_primary_salary"])
      table.write(i, 14, data["detail_primary_address"])
      table.write(i, 15, data["detail_content_text"])
      table.write(i, 16, data["detail_op_name"])
      table.write(i, 17, data["detail_op_job"])
      table.write(i, 18, data["detail_op_status"])
      table.write(i, 19, data["detail_primary_tags"])
      table.write(i, 20, data["detail_content_address"])
      i += 1
    book.save(r'C:\%s_boss_software_testing.xls' % time.strftime('%Y%m%d%H%M%S'))
    print("Excel saved successfully")
 
if __name__ == '__main__':
  city_list = MyJob("","").get_city(["杭州"])
  query_list = ["軟件測試", "測試工程師"]
  datas_list = []
  for city in city_list:
    for query in query_list:
      myjob = MyJob(city, query)
      datas = myjob.get_job_list(myjob.list_url, myjob.datas)
      datas_list.extend(datas)
  myjob.setExcel(datas_list)
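Two practical notes on the script above: the cookie and user-agent hard-coded in self.header were captured from a browser session, so Boss Zhipin will likely start rejecting requests once that cookie expires and you will need to paste in a fresh one from your own browser; and the final setExcel call reuses the last MyJob instance created in the loop, which is safe only because setExcel never reads instance state.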

That covers the details of scraping Boss Zhipin static pages with Python and bs4.
