Scraping Shanghai housing listings from 房天下 (fang.com)

May 23, 2017


# Scraping Shanghai housing listings from 房天下 (fang.com)

A few days ago I put together a script that pulls Shanghai housing prices from the site and saves them to an Excel file.
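
The script fetches the listing pages with requests, parses them with BeautifulSoup, scrapes several list pages in parallel with a multiprocessing pool, and uses xlsxwriter to write one worksheet per district into a date-stamped workbook. requests, bs4 and xlsxwriter are third-party packages that need to be installed first (lxml is imported as well, but the parsing below actually uses html.parser).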

# -*- coding: utf-8 -*-
# Function: Shanghai house prices
# Author: zinber

from bs4 import BeautifulSoup as BS
from multiprocessing import Pool
import re
import lxml
import datetime
import cProfile
import socket
import copy
import xlsxwriter
import requests
import bs4
import time


starttime = datetime.datetime.now()
base_url=r'http://esf.sh.fang.com'
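# filter segment appended to each district URL in get_section(); fang.com appears to encode
# the price / area / room-count search filters in this path fragment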
house_param=r'c2250-d2300-g22-j250-k270-r25-s210'
cur_local_time=time.localtime(time.time())
file_name="{year}{month:02}{day:02}.xlsx".format(year=cur_local_time.tm_year,month=cur_local_time.tm_mon,day=cur_local_time.tm_mday)  # xlsxwriter writes xlsx-format workbooks

# test_search_dict = {'昌平': {'霍营': {'13号线': 'http://bj.fangjia.com/ershoufang/--r-%E6%98%8C%E5%B9%B3|w-13%E5%8F%B7%E7%BA%BF|b-%E9%9C%8D%E8%90%A5'}}}

search_list = []  # list of listing-page URL entries
tmp_list = []  # scratch list used while building those entries
layer = -1
#houseList

# Fetch a page and return its HTML text
def get_page(url):
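    # request headers that mimic a desktop Chrome visit; Referer and Host match the esf.sh.fang.com listing pages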
    headers = {
        'User-Agent': r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Referer': r'http://esf.sh.fang.com/house-a031/c2250-d2300-g22-j250-k270-r25-s210/',
        'Host': r'esf.sh.fang.com',
        'Connection': 'keep-alive',
        'Content-Type': 'text/html; charset=gb2312',
    }
    r=requests.get(url,headers=headers,timeout=10)  # explicit timeout so a hung request raises instead of blocking forever
    # print(r.url,r.status_code,r.encoding)
    return r.text


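# NOTE: get_search(), get_info_list() and get_info_pn_list() below are not called from __main__;
# they appear to be left over from an earlier fangjia.com version of the script (see the
# test_search_dict sample above) and are kept unchanged.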
# Build a {link text: href} dict of links whose href matches the given key
def get_search(page, key):
    print("get_search--key:%s"%(key))
    soup = BS(page, 'html.parser')
    search_list = soup.find_all(href=re.compile(key), target='')
    search_dict = {}
    for i in range(len(search_list)):
        soup = BS(str(search_list[i]), 'html.parser')
        key = soup.select('a')[0].get_text()
        value = soup.a.attrs['href']
        search_dict[key] = value
    print("get_search-----search_dict:",search_dict)
    return search_dict


# Walk a nested dict of search terms and collect [index..., url] entries (recursive traversal)
def get_info_list(search_dict, layer, tmp_list, search_list):
    layer += 1  # track the current nesting depth
    for i in range(len(search_dict)):
        tmp_key = list(search_dict.keys())[i]  # key at the current level
        tmp_list.append(tmp_key)   # use the key as an index entry in tmp_list
        tmp_value = search_dict[tmp_key]
        if isinstance(tmp_value, str):   # value is a url
            tmp_list.append(tmp_value)   # append the url to tmp_list
            search_list.append(copy.deepcopy(tmp_list))   # store the indexed url in search_list
            tmp_list = tmp_list[:layer]  # trim the index back to the current level
        elif tmp_value == '':   # skip empty values
            layer -= 2           # step back out of this level
            tmp_list = tmp_list[:layer]   # trim the index back to the current level
        else:
            get_info_list(tmp_value, layer, tmp_list, search_list)  # value is another dict: recurse
            tmp_list = tmp_list[:layer]
    return search_list


# Build the paginated list-page URLs (pages 1-10) for each search entry
def get_info_pn_list(search_list,deep):
    fin_search_list = []
    for i in range(len(search_list)):
        print('>>>Scraping %s' % search_list[i][:deep])
        search_url = search_list[i][deep]
        print('search_url:',search_url)
        try:
            page = get_page(search_url)
        except Exception:
            print('Page fetch timed out')
            continue
        soup = BS(page, 'html.parser')
        # get the maximum page count (disabled; the loop below just assumes 10 pages)
        # print(soup)
        # pn_num = soup.select('span[class="mr5"]')[0].get_text()
        # rule = re.compile(r'\d+')
        # max_pn = int(rule.findall(pn_num)[1])
        # assemble the paginated urls
        for pn in range(1, 11):
            print('************************Scraping page %s************************' % pn)
            pn_rule = re.compile('[|]')
            fin_url = pn_rule.sub(r'|e-%s|' % pn, search_url, 1)
            tmp_url_list = copy.deepcopy(search_list[i][:deep])
            tmp_url_list.append(fin_url)
            fin_search_list.append(tmp_url_list)
    return fin_search_list


# Scrape the detail fields (title, address, attributes, area, more info, link) from each list page
def get_info(url_list, process_i):
    # print('Process %s started' % process_i)
    fin_info_list = []
    # print('url_list:',url_list)
    for i in range(len(url_list)):
        try:
            page = get_page(url_list[i])
        except Exception:
            print('Failed to fetch %s' % url_list[i])
            continue
        soup = BS(page, 'html.parser')

        title_list = soup.select('p[class="title"]')
        # print('title_list',len(title_list))
        address_list = soup.select('p[class="mt10"]')
        # print('address_list',len(address_list))
        attr_list = soup.select('p[class="mt12"]')
        # print('attr_list',len(attr_list))
        area_list = soup.select('div[class="area alignR"]')
        # print('area_list',len(area_list))
        more_info = soup.select('div[class="moreInfo"]')
        # print('more_info',len(more_info))
    #     price_list = soup.find_all(attrs={"class": "xq_aprice xq_esf_width"})  # select() cannot match some attribute values (ones containing spaces); find_all(attrs={}) works instead
        for num in range(len(title_list)):
            tag_tmp_list = []
            try:
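                # walk the child nodes of each block below; child.string is None for nested tags,
                # so .strip() can raise AttributeError, which the except at the end catches by
                # skipping the whole listing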
                link=''
                title=''
                for child in title_list[num].children:
                    if isinstance(child, bs4.element.Tag):
                        if 'href' in child.attrs:
                            link=base_url+child.attrs.get('href')
                    text=child.string.strip()
                    if len(text) > 0:
                        title=title+text+';'

                print(r'************************Fetching %s************************' % title)
                # print('--title_list',title)
                
                address=''
                for child in address_list[num].children:
                    text=child.string.strip()
                    if len(text) > 0:
                        address=address+text+';'
                # print('--area_list'+address)
                
                attr=''
                for child in attr_list[num].children:
                    text=child.string.strip()
                    if len(text) > 0:
                        attr=attr+text+';'
                # print('--attr-list',attr)
                
                area=''
                for child in area_list[num].children:
                    text=child.string.strip()
                    totalCount = re.sub(r"\D", "", text)  # keep only the digits of the area figure
                    if totalCount :
                        area='{}平米'.format(totalCount)
                # print('--area_list',area)
                
                more=''
                for child in more_info[num].children:
                    if isinstance(child, bs4.element.Tag):
                        for content in child.contents:
                            text=content.string.strip()
                            if len(text) > 0:
                                more=more+text
                    more=more+';'
                # print('--more:',more)
                
                
                # tag_tmp_list = copy.deepcopy(fin_search_list[i][:deep])
                for tag in [title, address, attr, area, more, link]:
                    tag_tmp_list.append(tag)
                fin_info_list.append(tag_tmp_list)
            except Exception:
                print('[scrape failed] skipping this listing')
                continue
    print('Process %s finished' % process_i)
    return fin_info_list


# Split the work into chunks for the worker processes
def assignment_search_list(fin_search_list, project_num):  # project_num = tasks per chunk; the smaller it is, the more chunks/processes
    assignment_list = []
    fin_search_list_len = len(fin_search_list)
    for i in range(0, fin_search_list_len, project_num):
        start = i
        end = i+project_num
        assignment_list.append(fin_search_list[start: end])  # slice off one chunk of the list
    return assignment_list


# Save the scraped results, one worksheet per district
def save_excel(sheet_name,fin_info_list):
    if len(fin_info_list) > 0:
        # tag_name = ['区域', '板块', '地铁', '标题', '位置', '平米', '户型', '楼层', '总价', '单位平米价格']
        tag_name = ['标题', '位置', '房屋属性', '建筑面积', '价格', '链接']  # title, location, attributes, floor area, price, link
        # the workbook itself is created in __main__ and written to the current working directory
        tmp = book.add_worksheet(sheet_name)
        tmp.write_row('A1', tag_name)
        row_num = len(fin_info_list)
        for i in range(0, row_num):
            con_pos = 'A%s' % (i+2)  # +2: xlsxwriter cells are 1-based and row 1 holds the header
            content = fin_info_list[i]
            tmp.write_row(con_pos, content)
    

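# Expand a district listing URL into one URL per result page.
# fang.com paginates its list pages with an '-i3<n>' suffix; the span.txt element holds the page count.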
def get_page_list(page,srcUrl):
    soup = BS(page, 'html.parser')
    ret_list=[srcUrl]
    txt = soup.find("span", {"class":"txt"})
    if txt:
        list_txt = txt.get_text().strip()  # reuse the element found above
        page_numbers=int(list_txt[1:-1])   # the page count sits between one leading and one trailing character
        for i in range(1,page_numbers):
            ret_list.append(srcUrl+'-i3{}'.format(i+1))  # pages 2..N use the '-i3<n>' suffix
    return ret_list

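# Build a {district name: filtered listing URL} dict from the div.qxName links on the home page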
def get_section(page):
    soup=BS(page,'html.parser')
    ret_dict={}
    section_list = soup.select('div[class="qxName"]')
    for ele in section_list:
        for child in ele.children:
            if isinstance(child, bs4.element.Tag):
                if 'href' in child.attrs:
                    ret_dict[child.string]=base_url+child.attrs.get('href')+house_param
    return ret_dict


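# Main flow: fetch the home page, collect the per-district listing URLs, expand each district's
# pagination, scrape the list pages with a pool of 4 worker processes, and write one worksheet
# per district into a single date-stamped workbook.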
if __name__ == '__main__':
    book = xlsxwriter.Workbook(file_name) 
    base_page=get_page(base_url)
    sections=get_section(base_page)
    for k,v in sections.items():
        fin_save_list = []  # accumulates the scraped rows for this district
        page = get_page(v)
        page_list=get_page_list(page,v)
        
        p = Pool(4)  # worker-process pool
        assignment_list = assignment_search_list(page_list, 2)  # split the page URLs into chunks of 2, one chunk per task
        result = []  # async results returned by the workers
        for i in range(len(assignment_list)):
            result.append(p.apply_async(get_info, args=(assignment_list[i], i)))
        p.close()
        p.join()
        for result_i in range(len(result)):
            fin_info_result_list = result[result_i].get()
            fin_save_list.extend(fin_info_result_list)  # merge the lists returned by each process
        save_excel(k,fin_save_list)
    book.close()
    endtime = datetime.datetime.now()
    elapsed = (endtime - starttime).seconds  # renamed so the time module is not shadowed
    print('Total time: %s s' % elapsed)