python爬虫获取身份证前6位对应的省市区信息

需求:获取到身份证前6位对应的省市区信息,但是网上找到的不是不全就是要各种积分,只能自己动手了


运行环境为python3.8.1。由于数据是通过国家统计局网站获取的,所以只需跑一遍并把数据保留下来即可,毕竟政府网站一直爬也不好,也没有必要,这个东西也不会经常变化。网站上的代码详细到居委会或村,需要截取前6位才能得到区县代码。下面为获取结果转到excel里的效果,基本上就可以直接拿来用了

下面为完整的python代码,参考网上各位大佬的代码整理调试而成

# -*- coding: utf-8 -*-
# author:

"""
通过国家统计局官网获取中国2019年所有城市数据
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/
"""
import re
import requests
import time
import sys

fileSavePath = 'F://data/China_Province_2019.txt'  # where scraped rows are appended
fileSavePath2 = 'F://data/China_Province_2019_mistake.txt'  # where error records are appended

# NOTE(review): the names below are initialised but never read anywhere in the
# visible script; they are kept only so that nothing relying on them breaks.
results2 = []
results3 = []
results4 = []
results5 = []
Dates1 = []


n = 0

# Fetch the top-level index page and extract (href, link text) pairs from its
# single-quoted anchors; each pair is later used as one first-level region.
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
# An explicit timeout keeps the script from blocking forever when the server
# stops responding (requests applies no timeout by default).
response = requests.get(url, timeout=10)
response.raise_for_status()
# Decode with the encoding detected from the body rather than the HTTP header.
response.encoding = response.apparent_encoding
pattern = re.compile("<a href='(.*?)'>(.*?)<")
result1 = re.findall(pattern, response.text)

# Base URL that every href scraped from the index and province pages is
# appended to.
_BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10
# Compiled once instead of on every loop iteration. The second pattern has no
# leading '<', exactly as in the original script.
_PROVINCE_PAGE_LINK_RE = re.compile("<a href='(.*?)'>(.*?)<")
_CITY_PAGE_LINK_RE = re.compile("a href='(.*?)'>(.*?)<")


def _fetch_links(page_url, link_re):
    """Download *page_url* and return its de-duplicated (href, text) links.

    Links whose text contains the character '0' are dropped; presumably this
    filters out the anchors that show the numeric code rather than the region
    name (TODO confirm against the page markup). Duplicates are removed while
    keeping the order in which links appear on the page, so repeated runs
    produce rows in the same order.

    Raises whatever ``requests`` raises on connection errors, timeouts or a
    non-2xx status.
    """
    with requests.get(page_url, timeout=_REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        html = resp.text
    unique_links = dict.fromkeys(link_re.findall(html))
    return [link for link in unique_links if '0' not in link[1]]


def _record_failure(region_name, level_tag):
    """Print and append the exception currently being handled, then pause 1s.

    Must be called from inside an ``except`` block so ``sys.exc_info()`` still
    refers to the failure being reported.
    """
    print("Unexpected error:", sys.exc_info())
    with open(fileSavePath2, 'a', encoding='utf-8') as f:
        f.write('{0}|{1}|{1}|{1}|{2}\n'.format(region_name, level_tag, sys.exc_info()))
    time.sleep(1)


for province_href, province_name in result1:
    try:
        city_links = _fetch_links(_BASE_URL + province_href, _PROVINCE_PAGE_LINK_RE)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C still stops the
        # crawl. The record names the failing province (the original wrote the
        # placeholder 'xd', which made the log useless for retries).
        _record_failure(province_name, '一级错误')
        continue

    for city_href, city_name in city_links:
        try:
            county_links = _fetch_links(_BASE_URL + city_href, _CITY_PAGE_LINK_RE)
            # Open the output file once per city page instead of once per row.
            with open(fileSavePath, 'a', encoding='utf-8') as f:
                for county_href, county_name in county_links:
                    # Characters 3..8 of the href are used as the 6-digit code;
                    # assumes hrefs shaped like 'NN/NNNNNN.html' - TODO confirm.
                    code = county_href[3:9]
                    row = '{0}|{1}|{2}|{3}'.format(code, province_name, city_name, county_name)
                    print(row)
                    f.write(row + '\n')
        except Exception:
            _record_failure(province_name, '二级错误')
            continue


print('well_done')

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注