
Visualizing Python Job Posting Data

2020/06/15

Sigh, internships at the big tech companies are hard to get. So let's analyze the Python job market on the recruitment site Lagou (拉勾网): crawl job data such as regional distribution, salary levels, and position requirements into a database, and then turn the crawled data into charts for visualization.

The source code has been uploaded to GitHub.

Development Environment

Windows 10, Python 3.7, and the relevant third-party libraries

Packet capture tool: Fiddler

Database: MySQL

IDE: PyCharm

Development Workflow

Part 1: Data Crawling

Crawl the job postings with Requests:

1: Use a session to keep cookie information


    def __init__(self):
        # one shared requests.Session so Lagou's cookies persist across requests
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.city_list = ""

2: A method to fetch the nationwide city list


    def handle_city(self):
        # the closing </a> anchors the non-greedy capture to the city name
        city_search = re.compile(r'www\.lagou\.com\/.*\/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        # extract the city list with the regular expression
        self.city_list = set(city_search.findall(city_result))
        self.lagou_session.cookies.clear()

3: Handle the server's "operation too frequent" response


        if '频繁' in response.text:
            # Lagou returns a page containing '频繁' ("too frequent") when it throttles us
            print('Request throttled, refreshing cookies')
            # the old cookies must be cleared first
            self.lagou_session.cookies.clear()
            # fetch fresh cookies by visiting the search list page again
            first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
            self.handle_request(method="GET", url=first_request_url)
            time.sleep(10)
            continue
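
The fragment above sits inside a shared request helper that the post calls handle_request but never shows in full. Here is a minimal sketch of how such a method might be organized, assuming only the attributes from the earlier snippets (lagou_session, header); the actual implementation is in the GitHub repo:

    def handle_request(self, method, url, data=None, info=None):
        # Hedged sketch, not the repo's exact code: loop until the response is
        # no longer Lagou's "too frequent" throttling page, then return the text.
        while True:
            if method == "GET":
                response = self.lagou_session.get(url=url, headers=self.header)
            else:
                response = self.lagou_session.post(url=url, headers=self.header, data=data)
            response.encoding = 'utf-8'
            if '频繁' in response.text:
                # same recovery as shown above: clear cookies, warm them up, wait, retry
                self.lagou_session.cookies.clear()
                first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
                self.lagou_session.get(url=first_request_url, headers=self.header)
                time.sleep(10)
                continue
            return response.text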

4: Speed up crawling with multiprocessing


    pool = multiprocessing.Pool(2)
    for city in lagou.city_list:
        pool.apply_async(lagou.handle_city_job,args=(city,))
    pool.close()
    pool.join()
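
One caveat for the Windows environment listed above: multiprocessing starts worker processes by re-importing the module, so the pool code has to live under an if __name__ == '__main__': guard. A minimal sketch of the entry point, where HandleLaGou is only a placeholder name for the crawler class (the post never names it):

import multiprocessing

if __name__ == '__main__':
    # required on Windows: each worker process re-imports this module at startup
    lagou = HandleLaGou()    # placeholder name for the crawler class shown above
    lagou.handle_city()      # build the nationwide city list first
    pool = multiprocessing.Pool(2)
    for city in lagou.city_list:
        pool.apply_async(lagou.handle_city_job, args=(city,))
    pool.close()
    pool.join()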

Part 2: Data Storage

1: Table design


from sqlalchemy import create_engine, Column, Integer, Float, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# create the database connection
engine = create_engine("mysql+pymysql://root:root@localhost:3306/lagou?charset=utf8")
# to operate on the database we need a session factory
Session = sessionmaker(bind=engine)

# declare a base class for the ORM models
Base = declarative_base()

class Lagoutables(Base):
    # table name
    __tablename__ = 'lagou_data'
    # id, primary key with auto increment
    id = Column(Integer, primary_key=True, autoincrement=True)
    # position ID (used later for deduplication)
    positionId = Column(Integer, nullable=True)
    # longitude
    longitude = Column(Float, nullable=False)
    # latitude
    latitude = Column(Float, nullable=False)
    # position name
    positionName = Column(String(length=50), nullable=False)
    # required working experience
    workYear = Column(String(length=20), nullable=False)
    # education requirement
    education = Column(String(length=20), nullable=False)
    # job nature
    jobNature = Column(String(length=20), nullable=True)
    # financing stage
    financeStage = Column(String(length=30), nullable=True)
    # company size
    companySize = Column(String(length=30), nullable=True)
    # industry / business field
    industryField = Column(String(length=30), nullable=True)
    # city
    city = Column(String(length=10), nullable=False)
    # position tags
    positionAdvantage = Column(String(length=200), nullable=True)
    # company short name
    companyShortName = Column(String(length=50), nullable=True)
    # company full name
    companyFullName = Column(String(length=200), nullable=True)
    # company district
    district = Column(String(length=20), nullable=True)
    # company benefit labels
    companyLabelList = Column(String(length=200), nullable=True)
    # salary
    salary = Column(String(length=20), nullable=False)
    # crawl date
    crawl_date = Column(String(length=20), nullable=False)
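
The post never shows the table actually being created; with a declarative model like the one above, standard SQLAlchemy can do it in one call (run once before the first crawl):

# create the lagou_data table from the model definition if it does not exist yet
Base.metadata.create_all(engine)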

2: Save the data to MySQL


import time

# Session and Lagoutables come from the model module defined above
class HandleLagouData(object):
    def __init__(self):
        # instantiate a database session
        self.mysql_session = Session()
        self.date = time.strftime("%Y-%m-%d", time.localtime())

    def insert_item(self, item):
        date = time.strftime("%Y-%m-%d", time.localtime())
        # the record to store
        data = Lagoutables(
            # position ID
            positionId=item['positionId'],
            # longitude
            longitude=item['longitude'],
            # latitude
            latitude=item['latitude'],
            # position name
            positionName=item['positionName'],
            # required working experience
            workYear=item['workYear'],
            # education requirement
            education=item['education'],
            # job nature
            jobNature=item['jobNature'],
            # financing stage
            financeStage=item['financeStage'],
            # company size
            companySize=item['companySize'],
            # industry / business field
            industryField=item['industryField'],
            # city
            city=item['city'],
            # position tags
            positionAdvantage=item['positionAdvantage'],
            # company short name
            companyShortName=item['companyShortName'],
            # company full name
            companyFullName=item['companyFullName'],
            # company district
            district=item['district'],
            # company benefit labels, joined into one string
            companyLabelList=','.join(item['companyLabelList']),
            # salary
            salary=item['salary'],
            # crawl date
            crawl_date=date
        )
        # before inserting, check whether this posting was already saved today
        query_result = self.mysql_session.query(Lagoutables).filter(Lagoutables.crawl_date == date,
                                                                    Lagoutables.positionId == item['positionId']).first()
        if query_result:
            pass
            # print('Posting already exists %s:%s:%s' % (item['positionId'], item['city'], item['positionName']))
        else:
            self.mysql_session.add(data)
            self.mysql_session.commit()
            # print('Inserted new posting %s' % item['positionId'])
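
The hand-off from crawler to storage is not shown in the post. A hedged sketch of how it might look inside the crawler's per-city loop, where job_list is a hypothetical name for the job entries parsed from one page of results:

    # sketch only: job_list holds the job dicts parsed from one result page (hypothetical name)
    lagou_mysql = HandleLagouData()   # in practice created once and reused
    for job_item in job_list:
        lagou_mysql.insert_item(job_item)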

3: Query and filter the MySQL data


    # industry breakdown (Counter below is collections.Counter)
    def query_industryfield_result(self):
        info = {}
        # query the industryField values crawled today
        result = self.mysql_session.query(Lagoutables.industryField).filter(
            Lagoutables.crawl_date == self.date
        ).all()
        # keep only the first industry tag of each record
        result_list1 = [x[0].split(',')[0] for x in result]
        # count occurrences and drop industries with 25 or fewer postings
        result_list2 = [x for x in Counter(result_list1).items() if x[1] > 25]
        data = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in data]
        info['x_name'] = name_list
        info['data'] = data
        return info
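
Each of these query helpers returns the same dict shape, which the front-end charts consume directly: x_name is the category axis and data is the matching name/value series. Purely for illustration (the values below are made up):

# hypothetical return value -- shape only, the numbers are invented
{
    'x_name': ['移动互联网', '电商', '金融'],
    'data': [
        {'name': '移动互联网', 'value': 300},
        {'name': '电商', 'value': 120},
        {'name': '金融', 'value': 80}
    ]
}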

    # salary distribution
    def query_salary_result(self):
        info = {}
        # query the salary values crawled today
        result = self.mysql_session.query(Lagoutables.salary).filter(Lagoutables.crawl_date == self.date).all()
        # flatten the raw rows
        result_list1 = [x[0] for x in result]
        # count occurrences, keeping salary ranges with more than 20 postings
        result_list2 = [x for x in Counter(result_list1).items() if x[1] > 20]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # working-experience requirements
    def query_workyear_result(self):
        info = {}
        # query the workYear values crawled today
        result = self.mysql_session.query(Lagoutables.workYear).filter(Lagoutables.crawl_date == self.date).all()
        # flatten the raw rows
        result_list1 = [x[0] for x in result]
        # count occurrences, keeping values with more than 15 postings
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2 if x[1] > 15]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # education requirements
    def query_education_result(self):
        info = {}
        # query the education values crawled today
        result = self.mysql_session.query(Lagoutables.education).filter(Lagoutables.crawl_date == self.date).all()
        # flatten the raw rows
        result_list1 = [x[0] for x in result]
        # count occurrences
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # number of postings per crawl date, for the line chart (func is sqlalchemy.func)
    def query_job_result(self):
        info = {}
        result = self.mysql_session.query(Lagoutables.crawl_date, func.count('*').label('c')).group_by(Lagoutables.crawl_date).all()
        result1 = [{"name": x[0], "value": x[1]} for x in result]
        name_list = [name['name'] for name in result1]
        info['x_name'] = name_list
        info['data'] = result1
        return info

    # postings per city
    def query_city_result(self):
        info = {}
        # group today's rows by city and count them
        result = self.mysql_session.query(Lagoutables.city, func.count('*').label('c')).filter(Lagoutables.crawl_date == self.date).group_by(Lagoutables.city).all()
        result1 = [{"name": x[0], "value": x[1]} for x in result]
        name_list = [name['name'] for name in result1]
        info['x_name'] = name_list
        info['data'] = result1
        return info

    # financing stage
    def query_financestage_result(self):
        info = {}
        # query the financeStage values crawled today
        result = self.mysql_session.query(Lagoutables.financeStage).filter(Lagoutables.crawl_date == self.date).all()
        # flatten the raw rows
        result_list1 = [x[0] for x in result]
        # count occurrences
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # company size
    def query_companysize_result(self):
        info = {}
        # query the companySize values crawled today
        result = self.mysql_session.query(Lagoutables.companySize).filter(Lagoutables.crawl_date == self.date).all()
        # flatten the raw rows
        result_list1 = [x[0] for x in result]
        # count occurrences
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info


    # job nature (full-time, internship, ...)
    def query_jobNature_result(self):
        info = {}
        # query the jobNature values crawled today
        result = self.mysql_session.query(Lagoutables.jobNature).filter(Lagoutables.crawl_date == self.date).all()
        # flatten the raw rows
        result_list1 = [x[0] for x in result]
        # count occurrences
        result_list2 = [x for x in Counter(result_list1).items()]
        result = [{"name": x[0], "value": x[1]} for x in result_list2]
        name_list = [name['name'] for name in result]
        info['x_name'] = name_list
        info['data'] = result
        return info

    # number of crawled records, overall and for today
    def count_result(self):
        info = {}
        info['all_count'] = self.mysql_session.query(Lagoutables).count()
        info['today_count'] = self.mysql_session.query(Lagoutables).filter(Lagoutables.crawl_date == self.date).count()
        return info

Part 3: Data Visualization

The visualization uses ECharts together with Flask.

@app.route('/get_echart_data')
def get_echart_data():
    info = {}
    # industry breakdown
    info['echart_1'] = lagou_mysql.query_industryfield_result()
    # salary distribution
    info['echart_2'] = lagou_mysql.query_salary_result()
    # posting counts per day, line chart
    info['echart_4'] = lagou_mysql.query_job_result()
    # working-experience requirements
    info['echart_5'] = lagou_mysql.query_workyear_result()
    # education requirements
    info['echart_6'] = lagou_mysql.query_education_result()
    # financing stage
    info['echart_31'] = lagou_mysql.query_financestage_result()
    # company size
    info['echart_32'] = lagou_mysql.query_companysize_result()
    # job nature
    info['echart_33'] = lagou_mysql.query_jobNature_result()
    # postings per city, for the map
    info['map'] = lagou_mysql.query_city_result()
    return jsonify(info)
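
The route assumes an already constructed Flask app and a lagou_mysql object exposing the query helpers from Part 2. A minimal sketch of that glue, assuming the helpers live on HandleLagouData (the shared mysql_session suggests so) and using placeholder template and port values; the page served at / loads ECharts and fetches /get_echart_data via AJAX:

from flask import Flask, jsonify, render_template

app = Flask(__name__)
# assumed: the query_*_result helpers sit on the HandleLagouData class from Part 2
lagou_mysql = HandleLagouData()

@app.route('/')
def index():
    # the template loads echarts.js and requests /get_echart_data for its data
    return render_template('index.html')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080, debug=True)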

Visualization Results

(Screenshot: the visualization page)
