从统计局采集最新的省市区县数据,纯js
本文更新(移步查阅):
19-04-15 新采集了2018的省市区三级坐标和行政区域边界
19-03-22 采集了2018的城市数据
18-11-28 采集了2017的城市数据
数据下载 GitHub:https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases
相关更新情况,请查阅我发布的其他文章,本文以下内容不再更新。
18-01-28早上6:30的火车,从三亚回老家,票难买啊。好激动~
声明:文中涉及到的数据和第三方接口、url仅供学习使用,请勿它用~
这几天都在磨着搭建本地测试环境,看到省市区数据表里面是空的,想着以前的老数据还是13年采集的,含省市区县4级数据共4.8万条,时间久了,使用过程中发现有些新的城市名称数据库中没有,县级数据从来就没有用到过,想着还是重新采集一份。
新采集的省市区数据有3589条,这次并没有把县级数据采过来,需要的时候再添加也挺好。
数据来源
国家统计局统计标准《2016年统计用区划代码和城乡划分代码(截止2016年07月31日)》,这个是2017-05-16发布的,当前是最新的。

数据采集
对于数据采集,因为工作需要,之前接触过一些小的数据采集功能。因为对html和js比较熟,很早以前就利用IE浏览器对本地html文件支持任意跨域ajax请求数据、以及支持读写Excel文件的特性,直接写一个html文件作为采集工具给别人使用,实现批量查询人员资料、考试结果之类的功能。所以采集省市区数据主要用的是js。
1. 抓取原始数据
打开网页http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
省份的数据就有了,进入市级页面,然后进入区级页面,还可以进入县级页面。整个流程地址结构非常简单,数据格式也很好提取。
进入网页后打开浏览器控制台,执行下面代码,这段代码仅仅包含采集省市区的,把县级的阉割掉了,13年的老代码有县级的。很早以前写的代码,风格有点丑,不过能正常使用就是好的,这个采集是“单线程的”,因为这些数据少,速度并不慢:
/* 获取城市名称http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html */ (function(){ if(!window.URL){ throw new Error("浏览器版本太低"); }; function ajax(url,True,False){ var ajax=new XMLHttpRequest(); ajax.timeout=1000; ajax.open("GET",url); ajax.onreadystatechange=function(){ if(ajax.readyState==4){ if(ajax.status==200){ True(ajax.responseText); }else{ False(); } } } ajax.send(); } function msg(){ console.log.apply(console, arguments); } function cityClass(name,url,code){ this.name=name; this.url=url; this.code=code; this.child=[]; this.tryCount=0; } cityClass.prototype={ getValue:function(){ var obj={name:this.name,code:this.code,child:[]}; for(var i=0;i(.+?)
"))+1){ reg.lastIndex=idx; while(match=reg.exec(text)){ var url=match[1]; if(url.indexOf("//")==-1 && url.indexOf("/")!=0){ url=path+"/"+url; } var name=match[2]; DATA.push(new cityClass(name,url,0)); } True(); }else{ msg("未发现省份数据"); } },function(){ msg("读取省份列表出错","程序终止"); }); } function load_shen(True, False){ var city=DATA[JD.shen]; city.tryCount++; if(city.tryCount>3){ msg("读取省份["+city.name+"]超过3次"); False(); return; }; function get(){ msg("读取省份["+city.name+"]", getJD()); save(); city.child[JD.si].tryCount=0; load_si(function(){ JD.shen++; if(JD.shen>=DATA.length){ JD.shen=0; True(); return; }; DATA[JD.shen].tryCount=0; load_shen(True,False); },function(){ False(); }); } if(city.child.length){ get(); }else{ ajax(city.url,function(text){ var reg=/.+?href='(.+?)'>(.+?)(.+?); var match; while(match=reg.exec(text)){ var url=match[1]; if(url.indexOf("//")==-1 && url.indexOf("/")!=0){ url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url; } var code=match[2]; var name=match[3]; city.child.push(new cityClass(name,url,code)); } JD.si=0; get(); },function(){ load_shen(True,False); }); }; } function load_si(True,False){ var shen=DATA[JD.shen]; var city=shen.child[JD.si]; city.tryCount++; if(city.tryCount>3){ msg("读取城市["+city.name+"]超过3次"); False(); return; }; function get(){ msg("___读取城市["+city.name+"]", getJD()); city.child[JD.xian].tryCount=0; JD.si++; if(JD.si>=shen.child.length){ JD.si=0; True(); return; }; shen.child[JD.si].tryCount=0; load_si(True,False); } if(city.child.length){ get(); }else{ ajax(city.url,function(text){ var reg=/class='(?:countytr|towntr)'.+?/ig; var match; while(match=reg.exec(text)){ var reg2=/class='(?:countytr|towntr)'.+?(?: (.+?)(.+?)(.+?)(.+?) 采集截图: 2. 处理数据和拼音标注
数据处理就简单些了,比如编号格式化、名称格式化等。
拼音标注:这个需要找一个接口对文字进行拼音翻译,只有一个要求:重庆能正常地翻译成chong qing即可,翻译成zhong qing的就low了。满足这个条件,百度上搜索到的翻译小网站80%就被干掉了。
浏览器中打开找到的翻译接口http://www.qqxiuzi.cn/zh/pinyin/ ,截止到目前是能正常调用的,因为要用ajax请求数据,在页面里面就没有跨域的问题,查看网页源码,把token值记录下来,这个网站翻译请求需要带这个token,注意~刷新页面要重新获取:
拼音这个因为数据量比较多,采用了“4个线程”采集,先把第一步采集到的文件打开,把数据复制到打开的翻译网站浏览器控制台里面执行(相当于把数据导入),然后执行下面代码:/* 拼音翻译 http://www.qqxiuzi.cn/zh/pinyin/ http://www.qqxiuzi.cn/zh/pinyin/show.php POST t=汉字&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token=页面token请求一次获取 先加载数据 控制台输入data.txt */ window.PageToken=window.PageToken||""; var FixTrim=function(name){ return name.replace(/^s+|s+/g,""); }; var CITY_LIST2; var QueryPinYin=function(end){ if(!window.PageToken){ console.error("Need PageToken"); return; }; var ids=[]; var fixCode=function(o){ if(o.deep==0){ o.orgCode="0"; }else{ o.orgCode=o.code; if(o.deep==1){ o.code=o.code.substr(o,4); }else{ o.code=o.code.replace(/(000000|000)/g,"");//有少部分区多3位 }; }; return o; }; var fix=function(o,p){ var name=o.name; if(o.deep==0){ name=name.replace(/(市|省|(维吾尔|壮族|回族)?自治区)/ig,""); }else if(o.deep==1){ if(name=="市辖区"){ name=p.o2.name; }else if(/行政区划/ig.test(name)){ name="直辖市"; }else if(name.length>2){ name=name.replace(/市/ig,""); }; }else{ if(name.length>2 && name!="市辖区" && !/(自治.|地区|矿区)/.test(name)){//直接排除会有同名的 name=name.replace(/(市|区|县|镇|管委会|街道办事处)/ig,""); }; }; var o2={ name:name ,ext_name:o.name ,id:+o.code||0 ,ext_id:+o.orgCode ,pid:p&&+p.code||0 ,deep:o.deep }; o.o2=o2; return o2; }; for(var i=0;i=ids.length){ thread--; if(thread==0){ end(); }; return; }; var idx_=idx; var id=ids[idx]; if(id.P){ stack++; if(stack%50==0){ setTimeout(function(){run()}); }else{ run(stack); }; return; }; var name=id.name; var tryCount=0; var tryLoad=function(){.ajax({ url:"/zh/pinyin/show.php" ,data:"t="+encodeURIComponent(name)+"&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token="+PageToken ,type:"POST" ,dataType:"text" ,timeout:1000 ,error:function(e){ if(tryCount>3){ console.error("--QueryPinYin error--"+e); run(); return; }; tryCount++; tryLoad(); } ,success:function(txt){ txt=FixTrim(txt.replace(//g,"").replace(/s+/g," ")); id.P=txt; console.log("--"+idx_+"-QueryPinYin "+name+":"+txt+" --"); run(); } }); }; tryLoad(); }; var thread=4; run(); run(); 
run(); run(); }; var ViewDown=function(){ console.log("完成:"+(Date.now()-RunPinYin.T1)/1000+"秒"); window.CITY_LIST_PINYIN=CITY_LIST2; var url=URL.createObjectURL( new Blob([ new Uint8Array([0xEF,0xBB,0xBF]) ,"var CITY_LIST_PINYIN=" ,JSON.stringify(CITY_LIST2,null,"t") ] ,{"type":"text/plain"}) ); var downA=document.createElement("A"); downA.innerHTML="下载查询好城市的文件"; downA.href=url; downA.download="data-pinyin.txt"; document.body.appendChild(downA); downA.click(); }; var RunPinYin=function(){ RunPinYin.T1=Date.now(); QueryPinYin(ViewDown); }; //立即执行代码 if(window.CITY_LIST){ if(!PageToken){ PageToken=prompt("Token"); }; RunPinYin(); }else{ console.error("data.txt未输入"); };
这时候会提示输入token,把刚才找到的token粘贴进去,然后就开始工作了:
还挺快的,2分钟多点全部翻译完成。
3. 格式化成CSV
数据全部有了,导出成比较常用的格式,CSV最好了。这个导出比较简单,把第二步保存的文件打开,复制数据到任意网页控制台执行,然后输入以下代码:
/* 格式并且输出为csv 先加载数据 控制台输入data-pinyin.txt 导入数据库: 文件格式Unicode,文字为字符流 检查id重复项,修正id 转入area_city 增加港澳台、海外两个省级 检查名称重复项,修正名称 select * from area_city where len(name)=1 select pid,name,count(*) from area_city group by pid,name having COUNT(*)>1 */ var FixTrim=function(name){ return name.replace(/^s+|s+$/g,""); }; function CSVName(name){ return '"'+FixTrim(name).replace(/"/g,'""')+'"'; }; var CITY_CSV=["id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name"]; for(var i=0;i
OK,数据全部搞完:
数据问题
- id编号和国家统计局的编号基本一致,方便以后更新。
- id重复项目前是没有(已优化过了),不过以前采集后直接对统计局的编号进行简单缩短后会有重复现象(算是精度丢失)。
- 拼音前缀取的是第一个字前两个字母和后两个字首字母,意图是让第一个字相同名称的尽量能排序在一起。排序1:黑龙江helj、湖北hub、湖南hun;排序2:湖北hb、黑龙江hlj、湖南hn,排序一胜出。
- 因为区名字是直接去掉市、区后缀,存在那么几对名字变得完全一样的,需要手动把市区后缀加上,不然会产生小问题。
- 最终数据已上传了一份到CSDN,含所有代码和本文档:http://download.csdn.net/download/xiangyuecn/10226964 ,GitHub下载最新数据:https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases
推荐阅读更多精彩内容
uni-app之定时器setInterval()爱吃鱼_自由阅读 435评论 1赞 0 利用python爬虫获取全国五级地址1、抓取省级地址 2019年数据[http://www.stats.gov.cn/tjsj/tjbz/tjyqhd...暮雨朝烟阅读 151评论 0赞 6 xlwings-让Excel好用到飞起!最近发现一个好用的Python库——xlwings,它可以很容易的使用Python操作Excel,也可以从Exce...疯狂大宝贝儿阅读 1,252评论 1赞 18 uniapp 操作通信录Contacts模块管理系统通讯录,用于可对系统通讯录进行增、删、改、查等操作。通过plus.contacts获取...见月荒州阅读 144评论 0赞 0 接口测试工具Web接口测试工具---Poster与Postman[http://www.cnblogs.com/fnng/p/...曹元_阅读 378评论 0赞 6赞1赞赞赏下载App{"dataManager":"[]","props":{"isServer":true,"initialState":{"global":{"done":false,"artFromType":null,"fontType":"black","modal":{"ContributeModal":false,"RewardListModal":false,"PayModal":false,"CollectionModal":false,"LikeListModal":false,"ReportModal":false,"QRCodeShareModal":false,"BookCatalogModal":false,"RewardModal":false},"ua":{"value":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36","isIE11":false,"earlyIE":null,"chrome":"58.0","firefox":null,"safari":null,"isMac":false},"diamondRate":{"displayable":false,"rate":0},"readMode":"day","locale":"zh-CN","seoList":[{"comments_count":1,"public_abbr":"setInterval()","share_image_url":"https://upload-images.jianshu.io/upload_images/8635312-f76af2bab16f2dc2.png","slug":"962f1e1ba77f","user":{"id":8635312,"nickname":"爱吃鱼_自由","slug":"34492e8d1395","avatar":"https://upload.jianshu.io/users/upload_avatars/8635312/9261b2b5-ae06-4946-9a0f-0add546bd5f0.png"},"likes_count":0,"title":"uni-app之定时器","id":80946474,"views_count":435},{"comments_count":0,"public_abbr":"1、抓取省级地址 
2019年数据[http://www.stats.gov.cn/tjsj/tjbz/tjyqhd...","share_image_url":"https://upload-images.jianshu.io/upload_images/24894327-952d378c1fec3ec7.png","slug":"95a1b7386598","user":{"id":24894327,"nickname":"暮雨朝烟","slug":"b9bea5ad67fb","avatar":"https://upload.jianshu.io/users/upload_avatars/24894327/35db4c91-7ff0-4599-816b-c878cdbbd241"},"likes_count":6,"title":"利用python爬虫获取全国五级地址","id":78267369,"views_count":151},{"comments_count":1,"public_abbr":"最近发现一个好用的Python库——xlwings,它可以很容易的使用Python操作Excel,也可以从Exce...","share_image_url":"https://upload-images.jianshu.io/upload_images/20543630-9d93dab6403ba08b.png","slug":"8f31dfc33355","user":{"id":20543630,"nickname":"疯狂大宝贝儿","slug":"84aa4767fdf2","avatar":"https://upload.jianshu.io/users/upload_avatars/20543630/b9ee8f12-4c40-4490-8994-b5a0e4a6b2af.jpeg"},"likes_count":18,"title":"xlwings-让Excel好用到飞起!","id":77221871,"views_count":1252},{"comments_count":0,"public_abbr":"Contacts模块管理系统通讯录,用于可对系统通讯录进行增、删、改、查等操作。通过plus.contacts获取...","share_image_url":"","slug":"a10efe33fe1c","user":{"id":7197012,"nickname":"见月荒州","slug":"e42e68857b67","avatar":"https://upload.jianshu.io/users/upload_avatars/7197012/3d4dcd18-e6fa-4567-bf25-28036ee12016"},"likes_count":0,"title":"uniapp 操作通信录","id":79516424,"views_count":144},{"comments_count":0,"public_abbr":"Web接口测试工具---Poster与Postman[http://www.cnblogs.com/fnng/p/...","share_image_url":"https://upload-images.jianshu.io/upload_images/23770617-3347eef11edd08d6.png","slug":"b39a40e4401a","user":{"id":23770617,"nickname":"曹元_","slug":"3bf86d473626","avatar":"https://upload.jianshu.io/users/upload_avatars/23770617/db472dfb-062c-440e-924b-ba8c3348ecde.jpg"},"likes_count":6,"title":"接口测试工具","id":76436630,"views_count":378}]},"note":{"data":{"is_author":false,"last_updated_at":1555373516,"public_title":"从统计局采集最新的省市区县数据,纯js","purchased":false,"liked_note":false,"comments_count":0,"free_content":"u003cblockquoteu003enu003cpu003e本文更新(移步查阅):u003cbru003en19-04-15 u003ca 
href="https://www.jianshu.com/p/e200899f1e3a" target="_blank"u003e新采集了2018的省市区三级坐标和行政区域边界u003c/au003eu003cbru003en19-03-22 u003ca href="https://www.jianshu.com/p/c3f7ef149ea7" target="_blank"u003e采集了2018的城市数据u003c/au003eu003cbru003en18-11-28 采集了2017的城市数据u003c/pu003enu003cpu003e数据下载 GitHub:u003ca href="https://links.jianshu.com/go?to=https%3A%2F%2Fgithub.com%2Fxiangyuecn%2FAreaCity-JsSpider-StatsGov%2Freleases" target="_blank" rel="nofollow"u003ehttps://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releasesu003c/au003eu003cbru003en相关更新情况,请查阅我发布的其他文章,本文以下内容不再更新。u003c/pu003enu003c/blockquoteu003enu003cblockquoteu003enu003cpu003e18-01-28早上6:30的火车,从三亚回老家,票难买啊。好激动~u003cbru003en声明:文中涉及到的数据和第三方接口、url仅供学习使用,请勿它用~u003c/pu003enu003c/blockquoteu003enu003cpu003e这几天都在磨着搭建本地测试环境,看到省市区数据表里面是空的,想着以前的老数据还是13年采集的,含省市区县4级数据共4.8万条,时间久了,使用过程中发现有些新的城市名称数据库中没有,县级数据从来就没有用到过,想着还是重新采集一份。u003c/pu003enu003cpu003e新采集的省市区数据有3589条,这次并没有把县级数据采过来,需要的时候再添加也挺好。u003c/pu003enu003ch1u003e数据来源u003c/h1u003enu003cpu003e国家统计局统计标准《2016年统计用区划代码和城乡划分代码(截止2016年07月31日)》,这个是2017-05-16发布的,当前是最新的。u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 592px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 57.03%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1038" data-height="592"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-5199f96a123a2c9b.png" data-original-width="1038" data-original-height="592" data-original-format="" data-original-filesize="216051"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003enu003ch1u003e数据采集u003c/h1u003enu003cpu003e对于数据采集,根据工作需要,对于一些小的数据采集功能有些接触。因为对html和js熟些,很早以前就用IE浏览器对本地html文件支持任意跨域ajax请求数据、和支持读写Excel文件,就直接写一个html文件作为采集工具给别人使用,批量查询人员资料、考试结果什么的功能。所以采集省市区数据主要用的js。u003c/pu003enu003ch2u003e1. 
抓取原始数据u003c/h2u003enu003cpu003e打开网页u003ccodeu003ehttp://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.htmlu003c/codeu003e省份的数据就有了,进入市级页面,然后进入区级页面,还可以进入县级页面。整个流程地址结构非常简单,数据格式也很好提取。u003c/pu003enu003cpu003e进入网页后打开浏览器控制台,执行下面代码,这段代码仅仅包含采集省市区的,把县级的阉割掉了,13年的老代码有县级的。很早以前写的代码,风格有点丑,不过能能正常使用就是好的,这个采集是“单线程的”,因为这些数据少,速度并不慢:u003c/pu003enu003cpreu003eu003ccode class="javascript"u003e/*n获取城市名称http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.htmln*/n(function(){nif(!window.URL){n throw new Error("浏览器版本太低");n};nfunction ajax(url,True,False){n var ajax=new XMLHttpRequest();n ajax.timeout=1000;n ajax.open("GET",url);n ajax.onreadystatechange=function(){n if(ajax.readyState==4){n if(ajax.status==200){n True(ajax.responseText);n }else{n False();n }n }n }n ajax.send();n}nfunction msg(){n console.log.apply(console, arguments);n}nnfunction cityClass(name,url,code){n this.name=name;n this.url=url;n this.code=code;n this.child=[];n this.tryCount=0;n}ncityClass.prototype={n getValue:function(){n var obj={name:this.name,code:this.code,child:[]};n for(var i=0;iu0026lt;this.child.length;i++){n obj.child.push(this.child[i].getValue());n }n return obj;n }n}nnfunction load_all(True){n var path="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016";n ajax(path+"/index.html",function(text){n var reg=/href='(.+?)'u0026gt;(.+?)u0026lt;br/ig,match;n var idx;n if((idx=text.indexOf("u0026lt;tr class='provincetr'u0026gt;"))+1){n reg.lastIndex=idx;n while(match=reg.exec(text)){n var url=match[1];n if(url.indexOf("//")==-1 u0026amp;u0026amp; url.indexOf("/")!=0){n url=path+"/"+url;n }n var name=match[2];n DATA.push(new cityClass(name,url,0));n }n True();n }else{n msg("未发现省份数据");n }n },function(){n msg("读取省份列表出错","程序终止");n });n}nfunction load_shen(True, False){n var city=DATA[JD.shen];n city.tryCount++;n if(city.tryCountu0026gt;3){n msg("读取省份["+city.name+"]超过3次");n False();n return;n };n n function get(){n msg("读取省份["+city.name+"]", getJD());n save();n n city.child[JD.si].tryCount=0;n 
load_si(function(){n JD.shen++;n if(JD.shenu0026gt;=DATA.length){n JD.shen=0;n True();n return;n };n DATA[JD.shen].tryCount=0;n n load_shen(True,False);n },function(){n False();n });n }n n if(city.child.length){n get();n }else{n ajax(city.url,function(text){n var reg=/u0026lt;tr class='citytr'u0026gt;.+?href='(.+?)'u0026gt;(.+?)u0026lt;.+?'u0026gt;(.+?)u0026lt;/ig;n var match;n while(match=reg.exec(text)){n var url=match[1];n if(url.indexOf("//")==-1 u0026amp;u0026amp; url.indexOf("/")!=0){n url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;n }n var code=match[2];n var name=match[3];n city.child.push(new cityClass(name,url,code));n }n n JD.si=0;n get();n },function(){n load_shen(True,False);n });n };n}nnfunction load_si(True,False){n var shen=DATA[JD.shen];n var city=shen.child[JD.si];n city.tryCount++;n if(city.tryCountu0026gt;3){n msg("读取城市["+city.name+"]超过3次");n False();n return;n };n n n function get(){n msg("___读取城市["+city.name+"]", getJD());n n city.child[JD.xian].tryCount=0;n JD.si++;n if(JD.siu0026gt;=shen.child.length){n JD.si=0;n True();n return;n };n shen.child[JD.si].tryCount=0;n n load_si(True,False);n }n n if(city.child.length){n get();n }else{n ajax(city.url,function(text){n var reg=/class='(?:countytr|towntr)'.+?u0026lt;\/tru0026gt;/ig;n var match;n while(match=reg.exec(text)){n var reg2=/class='(?:countytr|towntr)'.+?(?:u0026lt;tdu0026gt;u0026lt;a href='(.+?)'u0026gt;(.+?)u0026lt;.+?'u0026gt;(.+?)u0026lt;|u0026lt;tdu0026gt;(.+?)u0026lt;.+?u0026lt;tdu0026gt;(.+?)u0026lt;)/ig;n var match2;n if(match2=reg2.exec(match[0])){n var url=match2[1]||"";n if(url.indexOf("//")==-1 u0026amp;u0026amp; url.indexOf("/")!=0){n url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;n }n var code=match2[2]||match2[4];n var name=match2[3]||match2[5];n city.child.push(new cityClass(name,url,code));n }else{n msg("未知城市模式:");n msg(city.url);n msg(match[0]);n throw new Error("end");n }n }n n JD.xian=0;n get();n },function(){n load_si(True,False);n });n 
};n}nnnfunction getJD(){n var str="省:"+(JD.shen+1)+"/"+DATA.length;n var shen=DATA[JD.shen];n if(shen){n str+=" 市:"+(JD.si+1)+"/"+shen.child.length;n var si=shen.child[JD.si];n if(si){n str+=" 县:"+(JD.xian+1)+"/"+si.child.length;n }else{n str+=" 县:"+JD.xian;n }n }else{n str+=" 市:"+JD.si+" 县:"+JD.xian;n }n return str;n}nfunction save(){n n}nnvar DATA=[];nvar JD;nwindow.RunLoad=function(shen,si,xian){n RunLoad.T1=Date.now();n JD={n shen:shen||0n ,si:si||0n ,xian:xian||0n }n n function get(){n DATA[JD.shen].tryCount=0;n load_shen(function(){n console.log("完成:"+(Date.now()-RunLoad.T1)/1000+"秒");n save();n n var data=[];n for(var i=0;iu0026lt;DATA.length;i++){n data.push(DATA[i].getValue());n }n n var url=URL.createObjectURL(n new Blob([n new Uint8Array([0xEF,0xBB,0xBF])n ,"var CITY_LIST="n ,JSON.stringify(data,null,"\t")n ]n ,{"type":"text/plain"})n );n var downA=document.createElement("A");n downA.innerHTML="下载查询好城市的文件";n downA.href=url;n downA.download="data.txt";n document.body.appendChild(downA);n downA.click();n n msg("--完成--");n },function(){n save();n msg("当前进度:", getJD());n });n }n n var data=localStorage["load_data"];n if(data){n DATA=JSON.parse(data);n get();n }else{n load_all(get);n }n}n})();//@ sourceURL=console.jsnnn//立即执行代码nRunLoad()nu003c/codeu003eu003c/preu003enu003cpu003e采集截图:u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 476px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 67.86999999999999%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1083" data-height="735"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-d67da80d59b94f3e.png" data-original-width="1083" data-original-height="735" data-original-format="" data-original-filesize="369795"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003enu003ch2u003e2. 
处理数据和拼音标注u003c/h2u003enu003cpu003e数据处理就简单些了,比如编号格式化、名称格式化等。u003c/pu003enu003cpu003e拼音标注:这个需要找一个接口对文字进行拼音翻译,只有一个要求:重庆能正常的翻译成chong qing即可,翻译成zhong qing的就low了。满足这个条件,百度上搜索到的翻译小网站80%就被干掉了。u003c/pu003enu003cpu003e浏览器中打开找到的翻译接口u003ccodeu003ehttp://www.qqxiuzi.cn/zh/pinyin/u003c/codeu003e,截止到目前是能正常调用的,因为要用ajax请求数据,在页面里面就没有跨域的问题,查看网页源码,把token值记录下来,这个网站翻译请求需要带这个token,注意~刷新页面要重新获取:u003cbru003enu003c/pu003eu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 464px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 44.57%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1041" data-height="464"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-3569fb95be67eef8.png" data-original-width="1041" data-original-height="464" data-original-format="" data-original-filesize="81218"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003eu003cpu003eu003c/pu003enu003cpu003e拼音这个因为数据量比较多,采用了“4个线程”采集,先把第一步采集到的文件打开,把数据复制到打开的翻译网站浏览器控制台里面执行(相当于把数据导入),然后执行下面代码:u003c/pu003enu003cpreu003eu003ccode class="javascript"u003e/*n拼音翻译nhttp://www.qqxiuzi.cn/zh/pinyin/nnhttp://www.qqxiuzi.cn/zh/pinyin/show.phpnPOSTnt=汉字u0026amp;d=1u0026amp;s=nullu0026amp;k=1u0026amp;b=nullu0026amp;h=nullu0026amp;u=nullu0026amp;v=1u0026amp;y=nullu0026amp;z=nullu0026amp;token=页面token请求一次获取nn先加载数据n 控制台输入data.txtn*/nwindow.PageToken=window.PageToken||"";nvar FixTrim=function(name){n return name.replace(/^\s+|\s+/g,"");n};nvar CITY_LIST2;nvar QueryPinYin=function(end){n if(!window.PageToken){n console.error("Need PageToken");n return;n };n var ids=[];n n var fixCode=function(o){n if(o.deep==0){n o.orgCode="0";n }else{n o.orgCode=o.code;n if(o.deep==1){n o.code=o.code.substr(o,4);n }else{n o.code=o.code.replace(/(000000|000)/g,"");//有少部分区多3位n };n };n return o;n };n var fix=function(o,p){n var name=o.name;n if(o.deep==0){n name=name.replace(/(市|省|(维吾尔|壮族|回族)?自治区)/ig,"");n }else 
if(o.deep==1){n if(name=="市辖区"){n name=p.o2.name;n }else if(/行政区划/ig.test(name)){n name="直辖市";n }else if(name.lengthu0026gt;2){n name=name.replace(/市/ig,"");n };n }else{n if(name.lengthu0026gt;2 u0026amp;u0026amp; name!="市辖区"n u0026amp;u0026amp; !/(自治.|地区|矿区)/.test(name)){//直接排除会有同名的n name=name.replace(/(市|区|县|镇|管委会|街道办事处)/ig,"");n };n };n var o2={n name:namen ,ext_name:o.namen ,id:+o.code||0n ,ext_id:+o.orgCoden ,pid:pu0026amp;u0026amp;+p.code||0n ,deep:o.deepn };n o.o2=o2;n return o2;n };n for(var i=0;iu0026lt;CITY_LIST.length;i++){n var shen=CITY_LIST[i];n shen.deep=0;n for(var i2=0;i2u0026lt;shen.child.length;i2++){n var si=shen.child[i2];n if(!shen.code){n shen.code=si.code.substr(0,2);n ids.push(fix(fixCode(shen)));n };n si.deep=1;n ids.push(fix(fixCode(si),shen));n n n for(var i3=0;i3u0026lt;si.child.length;i3++){n var qu=si.child[i3];n qu.deep=2;n ids.push(fix(fixCode(qu),si));n };n };n };n CITY_LIST2=ids;n //console.log(JSON.stringify(ids,null,"\t"))n //return;n n var idx=-1;n var run=function(stack){n stack=+stack||0;n idx++;n if(idxu0026gt;=ids.length){n thread--;n if(thread==0){n end();n };n return;n };n n var idx_=idx;n var id=ids[idx];n if(id.P){n stack++;n if(stack%50==0){n setTimeout(function(){run()});n }else{n run(stack);n };n return;n };n n var name=id.name;n var tryCount=0;n var tryLoad=function(){n .ajax({n url:"/zh/pinyin/show.php"n ,data:"t="+encodeURIComponent(name)+"u0026amp;d=1u0026amp;s=nullu0026amp;k=1u0026amp;b=nullu0026amp;h=nullu0026amp;u=nullu0026amp;v=1u0026amp;y=nullu0026amp;z=nullu0026amp;token="+PageTokenn ,type:"POST"n ,dataType:"text"n ,timeout:1000n ,error:function(e){n if(tryCountu0026gt;3){n console.error("--QueryPinYin error--"+e);n run();n return;n };n tryCount++;n tryLoad();n }n ,success:function(txt){n txt=FixTrim(txt.replace(/u0026lt;.+?u0026gt;/g,"").replace(/\s+/g," "));n id.P=txt;n console.log("--"+idx_+"-QueryPinYin "+name+":"+txt+" --");n run();n }n });n };n tryLoad();n };n n var thread=4;n run();n run();n run();n 
run();n};nnnvar ViewDown=function(){n console.log("完成:"+(Date.now()-RunPinYin.T1)/1000+"秒");n window.CITY_LIST_PINYIN=CITY_LIST2;n var url=URL.createObjectURL(n new Blob([n new Uint8Array([0xEF,0xBB,0xBF])n ,"var CITY_LIST_PINYIN="n ,JSON.stringify(CITY_LIST2,null,"\t")n ]n ,{"type":"text/plain"})n );n var downA=document.createElement("A");n downA.innerHTML="下载查询好城市的文件";n downA.href=url;n downA.download="data-pinyin.txt";n document.body.appendChild(downA);n downA.click();n};nnvar RunPinYin=function(){n RunPinYin.T1=Date.now();n QueryPinYin(ViewDown);n};nnn//立即执行代码nif(window.CITY_LIST){n if(!PageToken){n PageToken=prompt("Token");n };n RunPinYin();n}else{n console.error("data.txt未输入");n};nu003c/codeu003eu003c/preu003enu003cpu003e这时候会提示输入token,把刚才找到的token粘贴进去,然后就开始工作了:u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 481px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 68.67999999999999%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1076" data-height="739"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-2a5c52becbcc8424.png" data-original-width="1076" data-original-height="739" data-original-format="" data-original-filesize="252673"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003enu003cpu003e还挺快的,2分钟多点全部翻译完成。u003c/pu003enu003ch2u003e3. 
格式化成CSVu003c/h2u003enu003cpu003e数据全部有了,导出成比较正常使用的格式,CSV最好了。这个导出比较简单,任意网页控制台把第二部保存的文件打开,复制数据到任意网页控制台,然后输入以下代码:u003c/pu003enu003cpreu003eu003ccode class="javascript"u003e/*n格式并且输出为csvnn先加载数据n 控制台输入data-pinyin.txtnn导入数据库:n 文件格式Unicode,文字为字符流n 检查id重复项,修正idn 转入area_cityn 增加港澳台、海外两个省级n 检查名称重复项,修正名称n select * from area_city where len(name)=1n select pid,name,count(*) from area_city group by pid,name having COUNT(*)u0026gt;1n*/nnvar FixTrim=function(name){n return name.replace(/^\s+|\s+/g,"");n};nfunction CSVName(name){n return '"'+FixTrim(name).replace(/"/g,'""')+'"';n};nnvar CITY_CSV=["id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name"];nfor(var i=0;iu0026lt;CITY_LIST_PINYIN.length;i++){n var o=CITY_LIST_PINYIN[i];n var pf="";n var pinyin=FixTrim(o.P).toLowerCase();n var ps=pinyin.split(" ");n for(var j=0;ju0026lt;ps.lengthu0026amp;u0026amp;ju0026lt;3;j++){n pf+=ps[j].substr(0,j==0?2:1);n };n n CITY_CSV.push(o.id+","+o.pid+","+o.deep+","+CSVName(o.name)n +","+CSVName(pf)+","+CSVName(o.P)n +","+CSVName(o.ext_id+"")+","+CSVName(o.ext_name));n};nnvar url=URL.createObjectURL(n new Blob([n new Uint8Array([0xEF,0xBB,0xBF])n ,CITY_CSV.join("\n")n ]n ,{"type":"text/plain"})n);nvar downA=document.createElement("A");ndownA.innerHTML="下载查询好城市的文件";ndownA.href=url;ndownA.download="ok_data.csv";ndocument.body.appendChild(downA);ndownA.click();nu003c/codeu003eu003c/preu003enu003cpu003eOK,数据全部搞完:u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 624px; max-height: 418px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 66.99000000000001%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="624" data-height="418"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-a217058081114bfe.png" data-original-width="624" data-original-height="418" data-original-format="" data-original-filesize="23305"u003eu003c/divu003enu003c/divu003enu003cdiv 
class="image-caption"u003eu003c/divu003enu003c/divu003enu003ch1u003e数据问题u003c/h1u003enu003colu003enu003cliu003eu003cpu003eid编号和国家统计局的编号基本一致,方便以后更新。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003eid重复项目前是没有(已优化过了),不过以前采集后直接对统计局的编号进行简单缩短后会有重复现象(算是精度丢失)。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003e拼音前缀取的是第一个字前两个字母和后两个字首字母,意图是让第一个字相同名称的尽量能排序在一起。排序1:u003ccodeu003e黑龙江helj、湖北hub、湖南hunu003c/codeu003e;排序2:u003ccodeu003e湖北hb、黑龙江hlj、湖南hnu003c/codeu003e,排序一胜出。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003e因为区名字是直接去掉市、区后缀,存在那么几对名字变得完全一样的,需要手动吧市区后缀加上,不然会产生小问题。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003eu003cdelu003e最终数据已上传了一份到CSDN,含所有代码和本文档:u003ccodeu003ehttp://download.csdn.net/download/xiangyuecn/10226964u003c/codeu003eu003c/delu003e,u003ca href="https://links.jianshu.com/go?to=https%3A%2F%2Fgithub.com%2Fxiangyuecn%2FAreaCity-JsSpider-StatsGov%2Freleases" target="_blank" rel="nofollow"u003eGitHub下载最新数据u003c/au003eu003c/pu003eu003c/liu003enu003c/olu003en","voted_down":false,"rewardable":true,"show_paid_comment_tips":false,"share_image_url":"http://upload-images.jianshu.io/upload_images/2152669-5199f96a123a2c9b.png","slug":"fb5bba916924","user":{"liked_by_user":false,"following_count":36,"gender":1,"avatar_widget":null,"slug":"458a7e3fd0e2","intro":"","likes_count":87,"nickname":"高坚果兄弟","badges":[],"total_fp_amount":"8114874055201809467","wordage":49480,"avatar":"https://upload.jianshu.io/users/upload_avatars/2152669/5460957dbc68.jpg","id":2152669,"liked_user":false},"likes_count":0,"paid_type":"free","show_ads":true,"paid_content_accessible":false,"total_fp_amount":"0","trial_open":false,"reprintable":true,"bookmarked":false,"wordage":1390,"featured_comments_count":0,"downvotes_count":0,"wangxin_trial_open":null,"guideShow":{"new_money_time_reward_type":5,"audit_user_nickname_spliter":0,"pc_note_bottom_btn":1,"pc_like_author_guidance":0,"audit_user_background_image_spliter":0,"audit_note_spliter":0,"launch_tab":1,"include_post":0,"pc_login_guidance":1,"audit_comment_
spliter":1,"pc_note_bottom_qrcode":1,"audit_user_avatar_spliter":0,"flow_ad_check_detail_button_style":0,"audit_collection_spliter":0,"pc_top_lottery_guidance":1,"subscription_guide_entry":1,"creation_muti_function_on":1,"audit_user_spliter":1,"pc_note_popup":0},"commentable":true,"total_rewards_count":0,"id":23239084,"notebook":{"name":""},"description":"本文更新(移步查阅):19-04-15 新采集了2018的省市区三级坐标和行政区域边界19-03-22 采集了2018的城市数据18-11-28 采集了2017的城市数据数据...","first_shared_at":1517035911,"views_count":567,"notebook_id":21660119},"baseList":{"likeList":[],"rewardList":[]},"status":"success","statusCode":0},"user":{"isLogin":false,"userInfo":{}},"comments":{"list":[],"featuredList":[]}},"initialProps":{"pageProps":{"query":{"slug":"fb5bba916924"}},"localeData":{"common":{"jianshu":"简书","diamond":"简书钻","totalAssets":"总资产{num}","diamondValue":" (约{num}元)","login":"登录","logout":"注销","register":"注册","on":"开","off":"关","follow":"关注","followBook":"关注连载","following":"已关注","cancelFollow":"取消关注","publish":"发布","wordage":"字数","audio":"音频","read":"阅读","reward":"赞赏","zan":"赞","comment":"评论","expand":"展开","prevPage":"上一页","nextPage":"下一页","floor":"楼","confirm":"确定","delete":"删除","report":"举报","fontSong":"宋体","fontBlack":"黑体","chs":"简体","cht":"繁体","jianChat":"简信","postRequest":"投稿请求","likeAndZan":"喜欢和赞","rewardAndPay":"赞赏和付费","home":"我的主页","markedNotes":"收藏的文章","likedNotes":"喜欢的文章","paidThings":"已购内容","wallet":"我的钱包","setting":"设置","feedback":"帮助与反馈","loading":"加载中...","needLogin":"请登录后进行操作","trialing":"文章正在审核中...","reprintTip":"禁止转载,如需转载请通过简信或评论联系作者。"},"error":{"rewardSelf":"无法打赏自己的文章哟~"},"message":{"paidNoteTip":"付费购买后才可以参与评论哦","CommentDisableTip":"作者关闭了评论功能","contentCanNotEmptyTip":"回复内容不能为空","addComment":"评论发布成功","deleteComment":"评论删除成功","likeComment":"评论点赞成功","setReadMode":"阅读模式设置成功","setFontType":"字体设置成功","setLocale":"显示语言设置成功","follow":"关注成功","cancelFollow":"取消关注成功","copySuccess":"复制代码成功"},"header":{"homePage":"首页","download":"下载APP","discover":"发现","message":"消息","reward":"赞赏支持","editNot
e":"编辑文章","writeNote":"写文章"},"note":{},"noteMeta":{"lastModified":"最后编辑于 ","wordage":"字数 {num}","viewsCount":"阅读 {num}"},"divider":{"selfText":"以下内容为付费内容,定价 ¥{price}","paidText":"已付费,可查看以下内容","notPaidText":"还有 {percent} 的精彩内容","modify":"点击修改"},"paidPanel":{"buyNote":"支付 ¥{price} 继续阅读","buyBook":"立即拿下 ¥{price}","freeTitle":"该作品为付费连载","freeText":"购买即可永久获取连载内的所有内容,包括将来更新的内容","paidTitle":"还没看够?拿下整部连载!","paidText":"永久获得连载内的所有内容, 包括将来更新的内容"},"book":{"last":"已是最后","lookCatalog":"查看连载目录","header":"文章来自以下连载"},"action":{"like":"{num}人点赞","collection":"收入专题","report":"举报文章"},"comment":{"allComments":"全部评论","featuredComments":"精彩评论","closed":"评论已关闭","close":"关闭评论","open":"打开评论","desc":"按时间倒序","asc":"按时间正序","disableText1":"用户已关闭评论,","disableText2":"与Ta简信交流","placeholder":"写下你的评论...","publish":"发表","create":" 添加新评论","reply":" 回复","restComments":"还有{num}条评论,","expandImage":"展开剩余{num}张图","deleteText":"确定要删除评论么?"},"collection":{"title":"被以下专题收入,发现更多相似内容","putToMyCollection":"收入我的专题"},"seoList":{"title":"推荐阅读","more":"更多精彩内容"},"sideList":{"title":"推荐阅读"},"wxShareModal":{"desc":"打开微信“扫一扫”,打开网页后点击屏幕右上角分享按钮"},"bookChapterModal":{"try":"试读","toggle":"切换顺序"},"collectionModal":{"title":"收入到我管理的专题","search":"搜索我管理的专题","newCollection":"新建专题","create":"创建","nothingFound":"未找到相关专题","loadMore":"展开查看更多"},"contributeModal":{"search":"搜索专题投稿","newCollection":"新建专题","addNewOne":"去新建一个","nothingFound":"未找到相关专题","loadMore":"展开查看更多","managed":"我管理的专题","recommend":"推荐专题"},"QRCodeShow":{"payTitle":"微信扫码支付","payText":"支付金额"},"rewardModal":{"title":"给作者送糖","custom":"自定义","placeholder":"给Ta留言...","choose":"选择支付方式","balance":"简书余额","tooltip":"网站该功能暂时下线,如需使用,请到简书App操作","confirm":"确认支付","success":"赞赏成功"},"payModal":{"payBook":"购买连载","payNote":"购买文章","promotion":"优惠券","promotionFetching":"优惠券获取中...","noPromotion":"无可用优惠券","promotionNum":"{num}张可用","noUsePromotion":"不使用优惠券","validPromotion":"可用优惠券","invalidPromotion":"不可用优惠券","total":"支付总额","tip1":"· 你将购买的商品为虚拟内容服务,购买后不支持退订、转让、退换,请斟酌确认。","tip2":"· 
购买后可在“已购内容”中查看和使用。","success":"购买成功"},"reportModal":{"ad":"广告及垃圾信息","plagiarism":"抄袭或未授权转载","placeholder":"写下举报的详情情况(选填)","success":"举报成功"},"guidModal":{"modalAText":"相似文章推荐","subText":"下载简书APP,浏览更多相似文章","btnAText":"先不下载,下次再说","followOkText":"关注作者成功!","followTextTip":"下载简书APP,作者更多精彩内容更新及时提醒!","followBtn":"下次再说","downloadTipText":"更多精彩内容下载简书APP","footerDownLoadText":"下载简书APP","modabTitle":"免费送你2次抽奖机会","modalbTip":"你有很大概率抽取AirPods Pro","modalbFooterTip":"下载简书APP,天天参与抽大奖","modalReward":"抽奖","scanQrtip":"扫码下载简书APP","downloadAppText":"下载简书APP,随时随地发现和创作内容","redText":"阅读","likesText":"赞","downLoadLeft":"下载App"}},"currentLocale":"zh-CN","asPath":"/p/fb5bba916924"}},"page":"/p/[slug]","query":{"slug":"fb5bba916924"},"buildId":"ZJP8vj8XvQ-o-3nKSjb0s","assetPrefix":"https://cdn2.jianshu.io/shakespeare"}
文章来源于互联网,如有雷同请联系站长删除:从统计局采集最新的省市区县数据,纯js