需求:获取到身份证前6位对应的省市区信息,但是网上找到的不是不全就是要各种积分,只能自己动手了
运行环境为 Python 3.8.1。数据是通过国家统计局网站获取的,所以只跑一遍并把结果保存下来——毕竟政府网站一直爬也不好,也没有必要,这些数据也不会经常变化。网站上的代码详细到居委会或村,这里只需要截取前6位。下面是把获取结果转到 Excel 里的样子,基本上就可以直接拿来用了。

下面为完整的 Python 代码,参考网上各位大佬的代码整理、调试而成。
# -*- coding: utf-8 -*-
# author:
"""
通过国家统计局官网获取中国2019年所有城市数据
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/
"""
import re
import requests
import time
import sys
fileSavePath = 'F://data/China_Province_2019.txt'  # output file for scraped records
fileSavePath2 = 'F://data/China_Province_2019_mistake.txt'  # log file for pages that failed

# Root of the 2019 division-code pages. Hrefs found on the index page and on
# province pages are appended to this root to build the next URL to fetch.
BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
REQUEST_TIMEOUT = 30  # seconds; requests waits forever when no timeout is given
RETRY_PAUSE = 1  # seconds to pause after a failed page before moving on

# Matches single-quoted anchors and captures (href, text up to the next '<').
LINK_PATTERN = re.compile("<a href='(.*?)'>(.*?)<")


def extract_links(html):
    """Return the unique ``(href, text)`` anchor pairs found in *html*.

    Duplicates are dropped while keeping first-seen page order, so repeated
    runs over the same page produce the same sequence.
    """
    return list(dict.fromkeys(LINK_PATTERN.findall(html)))


def named_links(html):
    """Return the links of *html* whose text does not contain the character '0'.

    NOTE(review): the pages appear to list every entry twice, once with the
    numeric code as link text and once with the name; the '0' test is how the
    code links are discarded. A name containing '0' would be dropped as well.
    """
    return [link for link in extract_links(html) if '0' not in link[1]]


def area_code(href):
    """Return the first six characters of the file name in *href*.

    For an href such as ``01/110101.html`` this yields ``110101``.
    """
    return href.rsplit('/', 1)[-1][:6]


def format_record(code, province, city, county):
    """Build one pipe-separated output line (without the trailing newline)."""
    return '{0}|{1}|{2}|{3}'.format(code, province, city, county)


def fetch_html(url):
    """Download *url* and return its text decoded with the detected encoding.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text


def log_mistake(label, level):
    """Record the exception currently being handled, then pause briefly.

    Must be called from inside an ``except`` block. *label* identifies the
    page that failed and *level* is the marker text written to the log.
    """
    error = sys.exc_info()
    print("Unexpected error:", error)
    with open(fileSavePath2, 'a', encoding='utf-8') as f:
        f.write('{0}|{1}|{1}|{1}|{2}\n'.format(label, level, error))
    time.sleep(RETRY_PAUSE)


def crawl_city(province, city, city_href):
    """Fetch one city page, then print and append one record per named link."""
    html = fetch_html(BASE_URL + city_href)
    records = [
        format_record(area_code(href), province, city, county)
        for href, county in named_links(html)
    ]
    for record in records:
        print(record)
    # One open per page instead of one per record.
    with open(fileSavePath, 'a', encoding='utf-8') as f:
        f.writelines(record + '\n' for record in records)


def main():
    """Walk index -> province pages -> city pages and save every record.

    A failure on the index page aborts the run. A failure on a province or
    city page is logged to ``fileSavePath2`` and the crawl continues with the
    next page.
    """
    index_html = fetch_html(BASE_URL + 'index.html')
    for province_href, province in extract_links(index_html):
        try:
            cities = named_links(fetch_html(BASE_URL + province_href))
        except Exception:
            # ``Exception`` rather than a bare except, so Ctrl-C still stops the run.
            log_mistake(province, '一级错误')
            continue
        for city_href, city in cities:
            try:
                crawl_city(province, city, city_href)
            except Exception:
                log_mistake(province, '二级错误')
    print('well_done')


if __name__ == '__main__':
    main()