Python 求職 Top10 城市,來看看是否有你所在的城市
(點擊上方藍字,快速關注我們)
來源: lemon 的自薦投稿(微信公號:Python數據之道)
如有好文章投稿,請點擊 → 這裡了解詳情
前言
從智聯招聘爬取相關信息後,我們關心的是如何對內容進行分析,獲取有用的信息。
本次以上篇文章「5分鐘掌握智聯招聘網站爬取並保存到MongoDB資料庫」中爬取的數據為基礎,分析關鍵詞為「python」的爬取數據的情況,獲取包括全國python招聘數量Top10的城市列表以及其他相關信息。
一、主要分析步驟
數據讀取
數據整理
對職位數量在全國主要城市的分布情況進行分析
對全國範圍內的職位月薪情況進行分析
對該職位招聘崗位要求描述進行詞雲圖分析,獲取頻率最高的關鍵字
選取兩個城市,分別分析月薪分布情況以及招聘要求的詞雲圖分析
二、具體分析過程
import pymongo
import pandas
as
pd
import
matplotlib
.
pyplot
as
plt
import
numpy
as
np
%
matplotlib
inline
plt
.
style
.
use
(
"ggplot"
)
# 解決matplotlib顯示中文問題
plt
.
rcParams
[
"font.sans-serif"
]
=
[
"SimHei"
]
# 指定默認字體
plt
.
rcParams
[
"axes.unicode_minus"
]
=
False
# 解決保存圖像是負號"-"顯示為方塊的問題
1 讀取數據
# Connect to MongoDB on localhost and select the collection that holds the
# scraped job postings.
client = pymongo.MongoClient("localhost")
db = client["zhilian"]
table = db["python"]

# Fields to load, in display order. Per the optional rename list below:
#   zwmc = job title, gsmc = company name, zwyx = monthly salary,
#   gbsj = publish time, gzdd = work location, fkl = feedback rate,
#   brief = job description, zw_link = posting URL,
#   save_date = date the record was saved.
columns = [
    "zwmc",
    "gsmc",
    "zwyx",
    "gbsj",
    "gzdd",
    "fkl",
    "brief",
    "zw_link",
    "_id",
    "save_date",
]

# url_set = set([records["zw_link"] for records in table.find()])
# print(url_set)

# Materialise the cursor once; `columns` selects and orders the fields.
df = pd.DataFrame(list(table.find()), columns=columns)

# Optional: rename the columns to readable Chinese labels.
# columns_update = ["職位名稱",
#                   "公司名稱",
#                   "職位月薪",
#                   "公布時間",
#                   "工作地點",
#                   "反饋率",
#                   "招聘簡介",
#                   "網頁鏈接",
#                   "_id",
#                   "信息保存日期"]
# df.columns = columns_update

print("總行數為:{}行".format(df.shape[0]))
df.head(2)
結果如圖1所示:
2 數據整理
2.1 將str格式的日期變為 datetime
# Convert the save date from str to pandas datetime values, then show the
# resulting dtype.
df["save_date"] = pd.to_datetime(df["save_date"])
print(df["save_date"].dtype)
# df["save_date"]
datetime64[ns]
2.2 篩選月薪格式為「XXXX-XXXX」的信息
# Drop the MongoDB "_id" column; keep only the fields used in the analysis.
df_clean = df[
    ["zwmc", "gsmc", "zwyx", "gbsj", "gzdd", "fkl", "brief", "zw_link", "save_date"]
]

# Keep only rows whose monthly salary contains a "XXXX-XXXX" range, which
# makes the later numeric analysis straightforward.
# The pattern must be a raw string with "\d": a bare "d+-d+" would match the
# literal letter "d" instead of digits.
# na=False treats missing salaries as non-matching instead of producing NaN
# in the boolean mask.
df_clean = df_clean[df_clean["zwyx"].str.contains(r"\d+-\d+", regex=True, na=False)]
print("總行數為:{}行".format(df_clean.shape[0]))
# df_clean.head()
總行數為:22605行
2.3 分割月薪欄位,分別獲取月薪的下限值和上限值
# http://stackoverflow.com/questions/14745022/pandas-dataframe-how-do-i-split-a-column-into-two
# http://stackoverflow.com/questions/20602947/append-column-to-pandas-dataframe
# df_temp.loc[: ,"zwyx_min"],df_temp.loc[: , "zwyx_max"] = df_temp.loc[: , "zwyx"].str.split("-",1).str  # triggers a warning

# Split "min-max" on the first "-" into two columns. expand=True returns a
# DataFrame directly; unpacking the `.str` accessor and passing the split
# limit positionally are no longer supported by current pandas.
salary_parts = df_clean.loc[:, "zwyx"].str.split("-", n=1, expand=True)
s_min, s_max = salary_parts[0], salary_parts[1]

df_min = s_min.to_frame("zwyx_min")
df_max = s_max.to_frame("zwyx_max")

# Append the two bounds to the cleaned data; rows are aligned on the index.
df_clean_concat = pd.concat([df_clean, df_min, df_max], axis=1)

# The split parts are still strings; convert them to numbers.
# df_clean["zwyx_min"].astype(int)
df_clean_concat["zwyx_min"] = pd.to_numeric(df_clean_concat["zwyx_min"])
df_clean_concat["zwyx_max"] = pd.to_numeric(df_clean_concat["zwyx_max"])
# print(df_clean["zwyx_min"].dtype)
print(df_clean_concat.dtypes)
df_clean_concat.head(2)
運行結果如圖2所示:
將數據信息按職位月薪進行排序
# Sort the postings in place by the lower salary bound (ascending).
df_clean_concat.sort_values("zwyx_min", inplace=True)
# df_clean_concat.tail()
判斷爬取的數據是否有重複值
# Check whether the scraped data contains duplicates: show every row whose
# posting URL already appeared in an earlier row.
# `duplicated()` already returns a boolean mask, so no `== True` is needed.
print(df_clean_concat[df_clean_concat.duplicated("zw_link")])
Empty DataFrame
Columns: [zwmc, gsmc, zwyx, gbsj, gzdd, fkl, brief, zw_link, save_date, zwyx_min, zwyx_max]
Index: []
從上述結果可看出,數據是沒有重複的。
3 對全國範圍內的職位進行分析
3.1 主要城市的招聘職位數量分布情況
# from IPython.core.display import display, HTML

# Major cities covered by the analysis.
ADDRESS = [
    "北京", "上海", "廣州", "深圳", "天津", "武漢", "西安", "成都", "大連",
    "長春", "瀋陽", "南京", "濟南", "青島", "杭州", "蘇州", "無錫", "寧波",
    "重慶", "鄭州", "長沙", "福州", "廈門", "哈爾濱", "石家莊", "合肥",
    "惠州", "太原", "昆明", "煙台", "佛山", "南昌", "貴陽", "南寧",
]

df_city = df_clean_concat.copy()

# Work locations are often written with a district suffix, e.g. "北京-朝陽區".
# Normalise them with pandas' regex replace(): the city name and everything
# after it is replaced by the bare city name.
for city in ADDRESS:
    df_city["gzdd"] = df_city["gzdd"].replace([(city + ".*")], [city], regex=True)

# Restrict the analysis to the major cities listed above.
df_city_main = df_city[df_city["gzdd"].isin(ADDRESS)]

# Count postings per city. Select the columns with a list: indexing a groupby
# with a bare tuple of names is not supported by current pandas.
df_city_main_count = df_city_main.groupby("gzdd")[["zwmc", "gsmc"]].count()

# Reuse the second count column to hold each city's share of all postings.
df_city_main_count["gsmc"] = (
    df_city_main_count["gsmc"] / df_city_main_count["gsmc"].sum()
)
df_city_main_count.columns = ["number", "percentage"]

# Sort by number of postings, largest first.
df_city_main_count.sort_values(by="number", ascending=False, inplace=True)

# Helper column "<city> <pct>%" used as the legend label when plotting.
df_city_main_count["label"] = (
    df_city_main_count.index
    + " "
    + (df_city_main_count["percentage"] * 100).round().astype("int").astype("str")
    + "%"
)
print(type(df_city_main_count))

# Top 10 cities by number of postings.
print(df_city_main_count.head(10))
<class "pandas.core.frame.DataFrame">
      number  percentage   label
gzdd
北京      6936    0.315948  北京 32%
上海      3213    0.146358  上海 15%
深圳      1908    0.086913  深圳 9%
成都      1290    0.058762  成都 6%
杭州      1174    0.053478  杭州 5%
廣州      1167    0.053159  廣州 5%
南京       826    0.037626  南京 4%
鄭州       741    0.033754  鄭州 3%
武漢       552    0.025145  武漢 3%
西安       473    0.021546  西安 2%
對結果進行繪圖:
from matplotlib import cm

label = df_city_main_count["label"]
sizes = df_city_main_count["number"]

# Figure with two side-by-side axes: the pie on the left, its legend on the
# right.
fig, axes = plt.subplots(figsize=(10, 6), ncols=2)
ax1, ax2 = axes.ravel()

# One colour per city, evenly spaced along the PiYG colormap.
colors = cm.PiYG(np.arange(len(sizes)) / len(sizes))
# colormaps: Paired, autumn, rainbow, gray,spring,Darks

# There are too many cities to label the wedges directly, so the pie is drawn
# without labels or percentages.
patches, texts = ax1.pie(
    sizes, labels=None, shadow=False, startangle=0, colors=colors
)
ax1.axis("equal")
ax1.set_title("職位數量分布", loc="center")

# ax2 only hosts the legend.
ax2.axis("off")
ax2.legend(patches, label, loc="center left", fontsize=9)

plt.savefig("job_distribute.jpg")
plt.show()
運行結果如下述餅圖所示:
3.2 月薪分布情況(全國)
from matplotlib.ticker import FormatStrFormatter

fig, (ax1, ax2) = plt.subplots(figsize=(10, 8), nrows=2)

# Upper panel: lower salary bound of every posting, in the current row order.
x_pos = list(range(df_clean_concat.shape[0]))
y1 = df_clean_concat["zwyx_min"]
ax1.plot(x_pos, y1)
ax1.set_title("Trend of min monthly salary in China", size=14)
ax1.set_xticklabels("")
ax1.set_ylabel("min monthly salary(RMB)")

# Lower panel: histogram of the lower salary bound.
# `density=True` replaces the `normed=1` argument, which matplotlib removed.
bins = [3000, 6000, 9000, 12000, 15000, 18000, 21000, 24000, 100000]
counts, bins, patches = ax2.hist(
    y1, bins, density=True, histtype="bar", facecolor="g", rwidth=0.8
)
ax2.set_title("Hist of min monthly salary in China", size=14)
ax2.set_yticklabels("")
# ax2.set_xlabel("min monthly salary(RMB)")

# http://stackoverflow.com/questions/6352740/matplotlib-label-each-bin
ax2.set_xticks(bins)  # use the bin edges as x ticks
ax2.set_xticklabels(bins, rotation=-90)  # rotate the tick labels

# Label the percentages below the x-axis.
# `counts` holds densities, so weight each by its bin width to recover the
# share of postings in that bin. The last bin (24000-100000) is far wider
# than the others; a plain ratio of densities would understate it.
shares = counts * np.diff(bins)
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for share, x in zip(shares, bin_centers):
    # # Label the raw counts
    # ax2.annotate(str(count), xy=(x, 0), xycoords=("data", "axes fraction"),
    #     xytext=(0, -70), textcoords="offset points", va="top", ha="center", rotation=-90)

    # Label the percentages
    percent = "%0.0f%%" % (100 * float(share) / shares.sum())
    ax2.annotate(
        percent,
        xy=(x, 0),
        xycoords=("data", "axes fraction"),
        xytext=(0, -40),
        textcoords="offset points",
        va="top",
        ha="center",
        rotation=-90,
        color="b",
        size=14,
    )

fig.savefig("salary_quanguo_min.jpg")
運行結果如下述圖所示:
不考慮部分極值後,分析月薪分布情況
# Ignore the extreme values: keep postings whose lower salary bound is at
# most 20000.
df_zwyx_adjust = df_clean_concat[df_clean_concat["zwyx_min"] <= 20000]

fig, (ax1, ax2) = plt.subplots(figsize=(10, 8), nrows=2)

# Upper panel: lower salary bound of every remaining posting, in row order.
x_pos = list(range(df_zwyx_adjust.shape[0]))
y1 = df_zwyx_adjust["zwyx_min"]
ax1.plot(x_pos, y1)
ax1.set_title("Trend of min monthly salary in China (adjust)", size=14)
ax1.set_xticklabels("")
ax1.set_ylabel("min monthly salary(RMB)")

# Lower panel: histogram of the lower salary bound.
# `density=True` replaces the `normed=1` argument, which matplotlib removed.
bins = [3000, 6000, 9000, 12000, 15000, 18000, 21000]
counts, bins, patches = ax2.hist(
    y1, bins, density=True, histtype="bar", facecolor="g", rwidth=0.8
)
ax2.set_title("Hist of min monthly salary in China (adjust)", size=14)
ax2.set_yticklabels("")
# ax2.set_xlabel("min monthly salary(RMB)")

# http://stackoverflow.com/questions/6352740/matplotlib-label-each-bin
ax2.set_xticks(bins)  # use the bin edges as x ticks
ax2.set_xticklabels(bins, rotation=-90)  # rotate the tick labels

# Label the percentages below the x-axis.
# `counts` holds densities; weighting by bin width gives the share of
# postings per bin and stays correct if the bin widths are ever changed.
shares = counts * np.diff(bins)
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for share, x in zip(shares, bin_centers):
    # # Label the raw counts
    # ax2.annotate(str(count), xy=(x, 0), xycoords=("data", "axes fraction"),
    #     xytext=(0, -70), textcoords="offset points", va="top", ha="center", rotation=-90)

    # Label the percentages
    percent = "%0.0f%%" % (100 * float(share) / shares.sum())
    ax2.annotate(
        percent,
        xy=(x, 0),
        xycoords=("data", "axes fraction"),
        xytext=(0, -40),
        textcoords="offset points",
        va="top",
        ha="center",
        rotation=-90,
        color="b",
        size=14,
    )

fig.savefig("salary_quanguo_min_adjust.jpg")
運行結果如下述圖所示:
3.3 相關技能要求
# Concatenate every job description into one string for the word cloud.
brief_list = list(df_clean_concat["brief"])
brief_str = "".join(brief_list)
print(type(brief_str))
# print(brief_str)
# Uncomment to write the text to disk:
# with open("brief_quanguo.txt", "w", encoding="utf-8") as f:
#     f.write(brief_str)
<class "str">
對獲取到的職位招聘要求進行詞雲圖分析,代碼如下:
# -*- coding: utf-8 -*-
"""
Created on Wed May 17 2017
@author: lemon
"""
import jieba
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import os
import PIL.Image as Image
import numpy as np

# Read the file content. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open("brief_quanguo.txt", "rb") as f:
    text = f.read()

# Segment the text with the jieba Chinese tokenizer first.
# cut_all: True is full mode, False is accurate mode.
wordlist = jieba.cut(text, cut_all=False)
wordlist_space_split = " ".join(wordlist)

# The mask image is loaded from the directory containing this script.
d = os.path.dirname(__file__)
alice_coloring = np.array(Image.open(os.path.join(d, "colors.png")))

my_wordcloud = WordCloud(
    background_color="#F0F8FF",
    max_words=100,
    mask=alice_coloring,
    max_font_size=300,
    random_state=42,
).generate(wordlist_space_split)

# Recolour the words from the mask image. The result must not be passed to
# plt.show(), whose only argument is the `block` flag.
image_colors = ImageColorGenerator(alice_coloring)
my_wordcloud.recolor(color_func=image_colors)

plt.imshow(my_wordcloud)  # display the word cloud as an image
plt.axis("off")  # hide the axes
plt.show()

# The output image is saved next to this script.
my_wordcloud.to_file(os.path.join(d, "brief_quanguo_colors_cloud.png"))
得到結果如下:
4 北京
4.1 月薪分布情況
# Select the postings whose work location contains "北京" and export them.
df_beijing = df_clean_concat[
    df_clean_concat["gzdd"].str.contains("北京.*", regex=True)
]
df_beijing.to_excel("zhilian_kw_python_bj.xlsx")
print("總行數為:{}行".format(df_beijing.shape[0]))
# df_beijing.head()
總行數為:6936行
參考全國分析時的代碼,月薪分布情況圖如下:
4.2 相關技能要求
# Concatenate the Beijing job descriptions into one string for the word cloud.
brief_list_bj = list(df_beijing["brief"])
brief_str_bj = "".join(brief_list_bj)
print(type(brief_str_bj))
# print(brief_str_bj)
# Uncomment to write the text to disk:
# with open("brief_beijing.txt", "w", encoding="utf-8") as f:
#     f.write(brief_str_bj)
<class "str">
詞雲圖如下:
5 長沙
5.1 月薪分布情況
# Select the postings whose work location contains "長沙" and export them.
df_changsha = df_clean_concat[
    df_clean_concat["gzdd"].str.contains("長沙.*", regex=True)
]
# df_changsha = pd.DataFrame(df_changsha, ignore_index=True)
df_changsha.to_excel("zhilian_kw_python_cs.xlsx")
print("總行數為:{}行".format(df_changsha.shape[0]))
# df_changsha.tail()
總行數為:280行
參考全國分析時的代碼,月薪分布情況圖如下:
5.2 相關技能要求
# Concatenate the Changsha job descriptions into one string for the word
# cloud.
brief_list_cs = list(df_changsha["brief"])
brief_str_cs = "".join(brief_list_cs)
print(type(brief_str_cs))
# print(brief_str_cs)
# Uncomment to write the text to disk:
# with open("brief_changsha.txt", "w", encoding="utf-8") as f:
#     f.write(brief_str_cs)
<class "str">
詞雲圖如下:
看完本文有收穫?請轉發分享給更多人
關注「Python開發者」,提升Python技能
![](https://pic.pimg.tw/zzuyanan/1488615166-1259157397.png)
![](https://pic.pimg.tw/zzuyanan/1482887990-2595557020.jpg)
※機器學習演算法實踐:樸素貝葉斯 (Naive Bayes)
※這2個套路走完, 你就成了 Facebook 認證的數據分析師
※動手實現推薦系統,挑戰高薪!
※5 分鐘掌握智聯招聘網站爬取並保存到 MongoDB 資料庫
※全棧開發者都應該關注這些
TAG:Python開發者 |
※新男團Nine Percent全國巡迴粉絲見面會,看看有沒有你的城市
※「Pop-up store」in 國大城市廣場
※Alphabet 旗下的 Sidewalk Labs 成立了一家子公司,想用大數據讓城市美好
※6大城市更新!AJ 1 OG WMNS Satin 「Shattered Backboard」抽籤開啟
※NBA 城市主題!多款 Air Force 1 iD 「City Edition」 現已上架
※6大城市支持使用Apple Watch3
※Cities:Skylines–Concerts for Mac 1.8.0 現代城市模擬遊戲 含中文
※Aussie rogue-lite城市的Brass眼睛注視著Early
※Breaking2一周年紀念NIKE又在憋大招?Zoom Fly SP城市系列首次完整曝光!
※Python爬蟲系列:使用selenium+Edge查詢指定城市天氣情況
※它被Lonely Planet評為年度最佳旅行城市,在這兒,跳舞喝酒才是正事
※Swaine Adeney Brigg | 在這座被雨困住的城市,邂逅「雨傘中的勞斯萊斯」
※SOUL.D送你一本Urban Style穿搭指南,喚醒沉寂一冬的城市街頭
※「最適合比特幣的城市」Arnhem Bitcoinstad網站不再接受比特幣支付
※城市之光,Ivo van de Grift插畫作品
※Google Pay現在可以處理城市交通票支付
※蟬翼鞋面顏值不俗!Nike Zoom Fly SP 帶來城市系列
※歐洲城市游秘密檔案 https://www.dolc.de
※《城市:天際線》4月加入Xbox Game Pass
※插畫:快樂行走,有你的城市我不寂寞-韓國yalzza