本文更新(移步查阅):
19-04-15 新采集了2018的省市区三级坐标和行政区域边界
19-03-22 采集了2018的城市数据
18-11-28 采集了2017的城市数据

数据下载 GitHub:https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases
相关更新情况,请查阅我发布的其他文章,本文以下内容不再更新。

18-01-28早上6:30的火车,从三亚回老家,票难买啊。好激动~
声明:文中涉及到的数据和第三方接口、url仅供学习使用,请勿它用~

这几天都在磨着搭建本地测试环境,看到省市区数据表里面是空的,想着以前的老数据还是13年采集的,含省市区县4级数据共4.8万条,时间久了,使用过程中发现有些新的城市名称数据库中没有,县级数据从来就没有用到过,想着还是重新采集一份。

新采集的省市区数据有3589条,这次并没有把县级数据采过来,需要的时候再添加也挺好。

数据来源

国家统计局统计标准《2016年统计用区划代码和城乡划分代码(截止2016年07月31日)》,这个是2017-05-16发布的,当前是最新的。

数据采集

对于数据采集,根据工作需要,对于一些小的数据采集功能有些接触。因为对html和js熟些,很早以前就用IE浏览器对本地html文件支持任意跨域ajax请求数据、和支持读写Excel文件,就直接写一个html文件作为采集工具给别人使用,批量查询人员资料、考试结果什么的功能。所以采集省市区数据主要用的js。

1. 抓取原始数据

打开网页http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html省份的数据就有了,进入市级页面,然后进入区级页面,还可以进入县级页面。整个流程地址结构非常简单,数据格式也很好提取。

进入网页后打开浏览器控制台,执行下面代码,这段代码仅仅包含采集省市区的,把县级的阉割掉了,13年的老代码有县级的。很早以前写的代码,风格有点丑,不过能正常使用就是好的,这个采集是“单线程的”,因为这些数据少,速度并不慢:

/*
获取城市名称http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
*/
(function(){
// Abort immediately when the browser does not expose window.URL;
// the thrown message reads "browser version too old".
if(!window.URL){
    throw new Error("浏览器版本太低");
};
/**
 * Issues a GET request for `url` with a 1000 ms timeout.
 *
 * @param {string} url - Address to fetch.
 * @param {function(string)} True - Called with the response text when the
 *        request completes with HTTP status 200.
 * @param {function()} False - Called with no arguments when the request
 *        completes with any other status.
 */
function ajax(url,True,False){
    var xhr=new XMLHttpRequest();
    xhr.timeout=1000;
    xhr.open("GET",url);
    xhr.onreadystatechange=function(){
        // Only the final state (4) is of interest.
        if(xhr.readyState!=4){
            return;
        }
        if(xhr.status!=200){
            False();
            return;
        }
        True(xhr.responseText);
    }
    xhr.send();
}
/**
 * Logging shortcut: forwards every argument it receives to console.log.
 */
function msg(){
    var parts=Array.prototype.slice.call(arguments);
    console.log.apply(console,parts);
}

/**
 * Constructor for one node of the region tree.
 *
 * @param {string} name - Display name of the region.
 * @param {string} url  - Page address stored for this node.
 * @param {string|number} code - Region code stored for this node.
 */
function cityClass(name,url,code){
    var node=this;

    // Values supplied by the caller.
    node.name=name;
    node.url=url;
    node.code=code;

    // Per-node state: child nodes and the attempt counter, both start empty.
    node.child=[];
    node.tryCount=0;
}
cityClass.prototype={
    getValue:function(){
        var obj={name:this.name,code:this.code,child:[]};
        for(var i=0;i(.+?)
"))+1){ reg.lastIndex=idx; while(match=reg.exec(text)){ var url=match[1]; if(url.indexOf("//")==-1 && url.indexOf("/")!=0){ url=path+"/"+url; } var name=match[2]; DATA.push(new cityClass(name,url,0)); } True(); }else{ msg("未发现省份数据"); } },function(){ msg("读取省份列表出错","程序终止"); }); } function load_shen(True, False){ var city=DATA[JD.shen]; city.tryCount++; if(city.tryCount>3){ msg("读取省份["+city.name+"]超过3次"); False(); return; }; function get(){ msg("读取省份["+city.name+"]", getJD()); save(); city.child[JD.si].tryCount=0; load_si(function(){ JD.shen++; if(JD.shen>=DATA.length){ JD.shen=0; True(); return; }; DATA[JD.shen].tryCount=0; load_shen(True,False); },function(){ False(); }); } if(city.child.length){ get(); }else{ ajax(city.url,function(text){ var reg=/.+?href='(.+?)'>(.+?)(.+?); var match; while(match=reg.exec(text)){ var url=match[1]; if(url.indexOf("//")==-1 && url.indexOf("/")!=0){ url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url; } var code=match[2]; var name=match[3]; city.child.push(new cityClass(name,url,code)); } JD.si=0; get(); },function(){ load_shen(True,False); }); }; } function load_si(True,False){ var shen=DATA[JD.shen]; var city=shen.child[JD.si]; city.tryCount++; if(city.tryCount>3){ msg("读取城市["+city.name+"]超过3次"); False(); return; }; function get(){ msg("___读取城市["+city.name+"]", getJD()); city.child[JD.xian].tryCount=0; JD.si++; if(JD.si>=shen.child.length){ JD.si=0; True(); return; }; shen.child[JD.si].tryCount=0; load_si(True,False); } if(city.child.length){ get(); }else{ ajax(city.url,function(text){ var reg=/class='(?:countytr|towntr)'.+?/ig; var match; while(match=reg.exec(text)){ var reg2=/class='(?:countytr|towntr)'.+?(?:(.+?)(.+?)(.+?)(.+?) 采集截图:

2. 处理数据和拼音标注

数据处理就简单些了,比如编号格式化、名称格式化等。 拼音标注:这个需要找一个接口对文字进行拼音翻译,只有一个要求:重庆能正常的翻译成chong qing即可,翻译成zhong qing的就low了。满足这个条件,百度上搜索到的翻译小网站80%就被干掉了。 浏览器中打开找到的翻译接口http://www.qqxiuzi.cn/zh/pinyin/,截止到目前是能正常调用的,因为要用ajax请求数据,在页面里面就没有跨域的问题,查看网页源码,把token值记录下来,这个网站翻译请求需要带这个token,注意~刷新页面要重新获取:
拼音这个因为数据量比较多,采用了“4个线程”采集,先把第一步采集到的文件打开,把数据复制到打开的翻译网站浏览器控制台里面执行(相当于把数据导入),然后执行下面代码:
/*
拼音翻译
http://www.qqxiuzi.cn/zh/pinyin/

http://www.qqxiuzi.cn/zh/pinyin/show.php
POST
t=汉字&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token=页面token请求一次获取

先加载数据
    控制台输入data.txt
*/
// Keep a token that is already set on window; otherwise start with an empty
// string, which makes the entry code at the end of this script prompt for one.
window.PageToken=window.PageToken||"";
/**
 * Removes leading and trailing whitespace from `name`; whitespace inside
 * the string is kept.
 *
 * @param {string} name - Text to trim.
 * @returns {string} The trimmed text.
 */
var FixTrim=function(name){
    // \s and the trailing $ are required: without them the pattern deletes
    // the letter "s" instead of whitespace.
    return name.replace(/^\s+|\s+$/g,"");
};
// Holds the flattened record list; read by ViewDown when building the download.
var CITY_LIST2;
/**
 * Flattens the nested CITY_LIST tree into a list of records, stores it in
 * CITY_LIST2, then asks /zh/pinyin/show.php for the pinyin of every record
 * name using 4 concurrent request chains. The pinyin is written to the
 * record's `P` property.
 *
 * Relies on globals: CITY_LIST, CITY_LIST2, PageToken, FixTrim and the
 * page's `$.ajax`.
 *
 * @param {function()} end - Called once, after all 4 chains have run out
 *        of records.
 */
var QueryPinYin=function(end){
    if(!window.PageToken){
        console.error("Need PageToken");
        return;
    };
    var ids=[];

    // Shortens o.code in place and keeps the untouched value in o.orgCode.
    var fixCode=function(o){
        if(o.deep==0){
            o.orgCode="0";
        }else{
            o.orgCode=o.code;
            if(o.deep==1){
                o.code=o.code.substr(0,4);
            }else{
                o.code=o.code.replace(/(000000|000)/g,"");// a few districts carry 3 extra digits
            };
        };
        return o;
    };
    // Builds the output record for node `o` (parent `p`), with a shortened name.
    var fix=function(o,p){
        var name=o.name;
        if(o.deep==0){
            name=name.replace(/(市|省|(维吾尔|壮族|回族)?自治区)/ig,"");
        }else if(o.deep==1){
            if(name=="市辖区"){
                // Reuse the already shortened name of the parent.
                name=p.o2.name;
            }else if(/行政区划/ig.test(name)){
                name="直辖市";
            }else if(name.length>2){
                name=name.replace(/市/ig,"");
            };
        }else{
            if(name.length>2 && name!="市辖区"
                && !/(自治.|地区|矿区)/.test(name)){// left untouched: stripping these produced duplicate names
                name=name.replace(/(市|区|县|镇|管委会|街道办事处)/ig,"");
            };
        };
        var o2={
            name:name
            ,ext_name:o.name
            ,id:+o.code||0
            ,ext_id:+o.orgCode
            ,pid:p&&+p.code||0
            ,deep:o.deep
        };
        o.o2=o2;
        return o2;
    };
    for(var i=0;i<CITY_LIST.length;i++){
        var shen=CITY_LIST[i];
        shen.deep=0;
        for(var i2=0;i2<shen.child.length;i2++){
            var si=shen.child[i2];
            if(!shen.code){
                // The province has no code of its own: take the first two
                // digits of its first city, and emit the province record once.
                shen.code=si.code.substr(0,2);
                ids.push(fix(fixCode(shen)));
            };
            si.deep=1;
            ids.push(fix(fixCode(si),shen));

            for(var i3=0;i3<si.child.length;i3++){
                var qu=si.child[i3];
                qu.deep=2;
                ids.push(fix(fixCode(qu),si));
            };
        };
    };
    CITY_LIST2=ids;

    var thread=4;   // number of request chains still running
    var idx=-1;     // index of the record most recently handed out
    var run=function(stack){
        stack=+stack||0;
        idx++;
        if(idx>=ids.length){
            thread--;
            if(thread==0){
                end();
            };
            return;
        };

        var idx_=idx;
        var id=ids[idx];
        if(id.P){
            // Already translated: skip, yielding via setTimeout every 50
            // consecutive skips instead of recursing further.
            stack++;
            if(stack%50==0){
                setTimeout(function(){run()});
            }else{
                run(stack);
            };
            return;
        };

        var name=id.name;
        var tryCount=0;
        var tryLoad=function(){
            $.ajax({
                url:"/zh/pinyin/show.php"
                ,data:"t="+encodeURIComponent(name)+"&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token="+PageToken
                ,type:"POST"
                ,dataType:"text"
                ,timeout:1000
                ,error:function(e){
                    if(tryCount>3){
                        // Give up on this record and move on.
                        console.error("--QueryPinYin error--"+e);
                        run();
                        return;
                    };
                    tryCount++;
                    tryLoad();
                }
                ,success:function(txt){
                    // Drop markup, collapse whitespace runs, trim the ends.
                    txt=FixTrim(txt.replace(/<.+?>/g,"").replace(/\s+/g," "));
                    id.P=txt;
                    console.log("--"+idx_+"-QueryPinYin "+name+":"+txt+" --");
                    run();
                }
            });
        };
        tryLoad();
    };

    run();
    run();
    run();
    run();
};


/**
 * Completion handler for the pinyin lookup: logs the elapsed time since
 * RunPinYin.T1, publishes CITY_LIST2 as window.CITY_LIST_PINYIN and
 * triggers a download of "data-pinyin.txt" (UTF-8 with BOM) containing
 * `var CITY_LIST_PINYIN=<tab-indented JSON>`.
 */
var ViewDown=function(){
    console.log("完成:"+(Date.now()-RunPinYin.T1)/1000+"秒");
    window.CITY_LIST_PINYIN=CITY_LIST2;
    var url=URL.createObjectURL(
        new Blob([
            new Uint8Array([0xEF,0xBB,0xBF])   // UTF-8 byte order mark
            ,"var CITY_LIST_PINYIN="
            // The indent must be a real tab; a plain "t" would be emitted
            // literally and make the file unparseable.
            ,JSON.stringify(CITY_LIST2,null,"\t")
        ]
        ,{"type":"text/plain"})
    );
    var downA=document.createElement("A");
    downA.innerHTML="下载查询好城市的文件";
    downA.href=url;
    downA.download="data-pinyin.txt";
    document.body.appendChild(downA);
    downA.click();
};

/**
 * Starts the pinyin lookup: stamps the start time on RunPinYin.T1 and
 * hands ViewDown to QueryPinYin as the completion callback.
 */
var RunPinYin=function(){
    var startedAt=Date.now();
    RunPinYin.T1=startedAt;

    var onFinished=ViewDown;
    QueryPinYin(onFinished);
};


// Run immediately: requires CITY_LIST (the contents of data.txt) to be loaded.
if(!window.CITY_LIST){
    console.error("data.txt未输入");
}else{
    // Ask for the page token only when none has been provided yet.
    if(!PageToken){
        PageToken=prompt("Token");
    };
    RunPinYin();
};

这时候会提示输入token,把刚才找到的token粘贴进去,然后就开始工作了:

还挺快的,2分钟多点全部翻译完成。

3. 格式化成CSV

数据全部有了,导出成比较正常使用的格式,CSV最好了。这个导出比较简单,把第二步保存的文件打开,复制数据到任意网页控制台,然后输入以下代码:

/*
格式并且输出为csv

先加载数据
    控制台输入data-pinyin.txt

导入数据库:
    文件格式Unicode,文字为字符流
    检查id重复项,修正id
    转入area_city
    增加港澳台、海外两个省级
    检查名称重复项,修正名称
        select * from area_city where len(name)=1
        select pid,name,count(*) from area_city group by pid,name having COUNT(*)>1
*/

/**
 * Removes leading and trailing whitespace from `name`; whitespace inside
 * the string is kept.
 *
 * @param {string} name - Text to trim.
 * @returns {string} The trimmed text.
 */
var FixTrim=function(name){
    // \s is required: a bare "s" would strip the letter s from both ends.
    return name.replace(/^\s+|\s+$/g,"");
};
/**
 * Formats `name` as a quoted CSV field: the value is trimmed with FixTrim,
 * every double quote is doubled, and the result is wrapped in double quotes.
 *
 * @param {string} name - Raw field value.
 * @returns {string} The quoted CSV field.
 */
function CSVName(name){
    var trimmed=FixTrim(name);
    var escaped=trimmed.split('"').join('""');
    return '"'+escaped+'"';
};

// CSV lines; the first entry is the header row.
var CITY_CSV=["id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name"];
for(var i=0;i<CITY_LIST_PINYIN.length;i++){
    var o=CITY_LIST_PINYIN[i];
    var pf="";
    var pinyin=FixTrim(o.P).toLowerCase();
    var ps=pinyin.split(" ");
    // Prefix: first two letters of the first syllable plus the first letter
    // of each of the next two syllables.
    for(var j=0;j<ps.length&&j<3;j++){
        pf+=ps[j].substr(0,j==0?2:1);
    };

    CITY_CSV.push(o.id+","+o.pid+","+o.deep+","+CSVName(o.name)
        +","+CSVName(pf)+","+CSVName(o.P)
        +","+CSVName(o.ext_id+"")+","+CSVName(o.ext_name));
};

// Offer the result as "ok_data.csv" (UTF-8 with BOM).
var url=URL.createObjectURL(
    new Blob([
        new Uint8Array([0xEF,0xBB,0xBF])
        ,CITY_CSV.join("\n")
    ]
    ,{"type":"text/plain"})
);
var downA=document.createElement("A");
downA.innerHTML="下载查询好城市的文件";
downA.href=url;
downA.download="ok_data.csv";
document.body.appendChild(downA);
downA.click();

OK,数据全部搞完:

数据问题

  1. id编号和国家统计局的编号基本一致,方便以后更新。

  2. id重复项目前是没有(已优化过了),不过以前采集后直接对统计局的编号进行简单缩短后会有重复现象(算是精度丢失)。

  3. 拼音前缀取的是第一个字前两个字母和后两个字首字母,意图是让第一个字相同名称的尽量能排序在一起。排序1:黑龙江helj、湖北hub、湖南hun;排序2:湖北hb、黑龙江hlj、湖南hn,排序一胜出。

  4. 因为区名字是直接去掉市、区后缀,存在那么几对名字变得完全一样的,需要手动把市、区后缀加上,不然会产生小问题。

  5. 最终数据已上传了一份到CSDN,含所有代码和本文档:http://download.csdn.net/download/xiangyuecn/10226964,GitHub下载最新数据

推荐阅读更多精彩内容

  • uni-app之定时器
    setInterval()
    爱吃鱼_自由阅读 435评论 1赞 0
  • 利用python爬虫获取全国五级地址
    1、抓取省级地址 2019年数据[http://www.stats.gov.cn/tjsj/tjbz/tjyqhd...
    暮雨朝烟阅读 151评论 0赞 6
  • xlwings-让Excel好用到飞起!
    最近发现一个好用的Python库——xlwings,它可以很容易的使用Python操作Excel,也可以从Exce...
    疯狂大宝贝儿阅读 1,252评论 1赞 18
  • uniapp 操作通信录
    Contacts模块管理系统通讯录,用于可对系统通讯录进行增、删、改、查等操作。通过plus.contacts获取...
    见月荒州阅读 144评论 0赞 0
  • 接口测试工具
    Web接口测试工具---Poster与Postman[http://www.cnblogs.com/fnng/p/...
    曹元_阅读 378评论 0赞 6
评论0
1赞
赞赏
下载App

{"dataManager":"[]","props":{"isServer":true,"initialState":{"global":{"done":false,"artFromType":null,"fontType":"black","modal":{"ContributeModal":false,"RewardListModal":false,"PayModal":false,"CollectionModal":false,"LikeListModal":false,"ReportModal":false,"QRCodeShareModal":false,"BookCatalogModal":false,"RewardModal":false},"ua":{"value":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36","isIE11":false,"earlyIE":null,"chrome":"58.0","firefox":null,"safari":null,"isMac":false},"diamondRate":{"displayable":false,"rate":0},"readMode":"day","locale":"zh-CN","seoList":[{"comments_count":1,"public_abbr":"setInterval()","share_image_url":"https://upload-images.jianshu.io/upload_images/8635312-f76af2bab16f2dc2.png","slug":"962f1e1ba77f","user":{"id":8635312,"nickname":"爱吃鱼_自由","slug":"34492e8d1395","avatar":"https://upload.jianshu.io/users/upload_avatars/8635312/9261b2b5-ae06-4946-9a0f-0add546bd5f0.png"},"likes_count":0,"title":"uni-app之定时器","id":80946474,"views_count":435},{"comments_count":0,"public_abbr":"1、抓取省级地址 
2019年数据[http://www.stats.gov.cn/tjsj/tjbz/tjyqhd...","share_image_url":"https://upload-images.jianshu.io/upload_images/24894327-952d378c1fec3ec7.png","slug":"95a1b7386598","user":{"id":24894327,"nickname":"暮雨朝烟","slug":"b9bea5ad67fb","avatar":"https://upload.jianshu.io/users/upload_avatars/24894327/35db4c91-7ff0-4599-816b-c878cdbbd241"},"likes_count":6,"title":"利用python爬虫获取全国五级地址","id":78267369,"views_count":151},{"comments_count":1,"public_abbr":"最近发现一个好用的Python库——xlwings,它可以很容易的使用Python操作Excel,也可以从Exce...","share_image_url":"https://upload-images.jianshu.io/upload_images/20543630-9d93dab6403ba08b.png","slug":"8f31dfc33355","user":{"id":20543630,"nickname":"疯狂大宝贝儿","slug":"84aa4767fdf2","avatar":"https://upload.jianshu.io/users/upload_avatars/20543630/b9ee8f12-4c40-4490-8994-b5a0e4a6b2af.jpeg"},"likes_count":18,"title":"xlwings-让Excel好用到飞起!","id":77221871,"views_count":1252},{"comments_count":0,"public_abbr":"Contacts模块管理系统通讯录,用于可对系统通讯录进行增、删、改、查等操作。通过plus.contacts获取...","share_image_url":"","slug":"a10efe33fe1c","user":{"id":7197012,"nickname":"见月荒州","slug":"e42e68857b67","avatar":"https://upload.jianshu.io/users/upload_avatars/7197012/3d4dcd18-e6fa-4567-bf25-28036ee12016"},"likes_count":0,"title":"uniapp 操作通信录","id":79516424,"views_count":144},{"comments_count":0,"public_abbr":"Web接口测试工具---Poster与Postman[http://www.cnblogs.com/fnng/p/...","share_image_url":"https://upload-images.jianshu.io/upload_images/23770617-3347eef11edd08d6.png","slug":"b39a40e4401a","user":{"id":23770617,"nickname":"曹元_","slug":"3bf86d473626","avatar":"https://upload.jianshu.io/users/upload_avatars/23770617/db472dfb-062c-440e-924b-ba8c3348ecde.jpg"},"likes_count":6,"title":"接口测试工具","id":76436630,"views_count":378}]},"note":{"data":{"is_author":false,"last_updated_at":1555373516,"public_title":"从统计局采集最新的省市区县数据,纯js","purchased":false,"liked_note":false,"comments_count":0,"free_content":"u003cblockquoteu003enu003cpu003e本文更新(移步查阅):u003cbru003en19-04-15 u003ca 
href="https://www.jianshu.com/p/e200899f1e3a" target="_blank"u003e新采集了2018的省市区三级坐标和行政区域边界u003c/au003eu003cbru003en19-03-22 u003ca href="https://www.jianshu.com/p/c3f7ef149ea7" target="_blank"u003e采集了2018的城市数据u003c/au003eu003cbru003en18-11-28 采集了2017的城市数据u003c/pu003enu003cpu003e数据下载 GitHub:u003ca href="https://links.jianshu.com/go?to=https%3A%2F%2Fgithub.com%2Fxiangyuecn%2FAreaCity-JsSpider-StatsGov%2Freleases" target="_blank" rel="nofollow"u003ehttps://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releasesu003c/au003eu003cbru003en相关更新情况,请查阅我发布的其他文章,本文以下内容不再更新。u003c/pu003enu003c/blockquoteu003enu003cblockquoteu003enu003cpu003e18-01-28早上6:30的火车,从三亚回老家,票难买啊。好激动~u003cbru003en声明:文中涉及到的数据和第三方接口、url仅供学习使用,请勿它用~u003c/pu003enu003c/blockquoteu003enu003cpu003e这几天都在磨着搭建本地测试环境,看到省市区数据表里面是空的,想着以前的老数据还是13年采集的,含省市区县4级数据共4.8万条,时间久了,使用过程中发现有些新的城市名称数据库中没有,县级数据从来就没有用到过,想着还是重新采集一份。u003c/pu003enu003cpu003e新采集的省市区数据有3589条,这次并没有把县级数据采过来,需要的时候再添加也挺好。u003c/pu003enu003ch1u003e数据来源u003c/h1u003enu003cpu003e国家统计局统计标准《2016年统计用区划代码和城乡划分代码(截止2016年07月31日)》,这个是2017-05-16发布的,当前是最新的。u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 592px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 57.03%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1038" data-height="592"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-5199f96a123a2c9b.png" data-original-width="1038" data-original-height="592" data-original-format="" data-original-filesize="216051"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003enu003ch1u003e数据采集u003c/h1u003enu003cpu003e对于数据采集,根据工作需要,对于一些小的数据采集功能有些接触。因为对html和js熟些,很早以前就用IE浏览器对本地html文件支持任意跨域ajax请求数据、和支持读写Excel文件,就直接写一个html文件作为采集工具给别人使用,批量查询人员资料、考试结果什么的功能。所以采集省市区数据主要用的js。u003c/pu003enu003ch2u003e1. 
抓取原始数据u003c/h2u003enu003cpu003e打开网页u003ccodeu003ehttp://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.htmlu003c/codeu003e省份的数据就有了,进入市级页面,然后进入区级页面,还可以进入县级页面。整个流程地址结构非常简单,数据格式也很好提取。u003c/pu003enu003cpu003e进入网页后打开浏览器控制台,执行下面代码,这段代码仅仅包含采集省市区的,把县级的阉割掉了,13年的老代码有县级的。很早以前写的代码,风格有点丑,不过能能正常使用就是好的,这个采集是“单线程的”,因为这些数据少,速度并不慢:u003c/pu003enu003cpreu003eu003ccode class="javascript"u003e/*n获取城市名称http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.htmln*/n(function(){nif(!window.URL){n throw new Error("浏览器版本太低");n};nfunction ajax(url,True,False){n var ajax=new XMLHttpRequest();n ajax.timeout=1000;n ajax.open("GET",url);n ajax.onreadystatechange=function(){n if(ajax.readyState==4){n if(ajax.status==200){n True(ajax.responseText);n }else{n False();n }n }n }n ajax.send();n}nfunction msg(){n console.log.apply(console, arguments);n}nnfunction cityClass(name,url,code){n this.name=name;n this.url=url;n this.code=code;n this.child=[];n this.tryCount=0;n}ncityClass.prototype={n getValue:function(){n var obj={name:this.name,code:this.code,child:[]};n for(var i=0;iu0026lt;this.child.length;i++){n obj.child.push(this.child[i].getValue());n }n return obj;n }n}nnfunction load_all(True){n var path="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016";n ajax(path+"/index.html",function(text){n var reg=/href='(.+?)'u0026gt;(.+?)u0026lt;br/ig,match;n var idx;n if((idx=text.indexOf("u0026lt;tr class='provincetr'u0026gt;"))+1){n reg.lastIndex=idx;n while(match=reg.exec(text)){n var url=match[1];n if(url.indexOf("//")==-1 u0026amp;u0026amp; url.indexOf("/")!=0){n url=path+"/"+url;n }n var name=match[2];n DATA.push(new cityClass(name,url,0));n }n True();n }else{n msg("未发现省份数据");n }n },function(){n msg("读取省份列表出错","程序终止");n });n}nfunction load_shen(True, False){n var city=DATA[JD.shen];n city.tryCount++;n if(city.tryCountu0026gt;3){n msg("读取省份["+city.name+"]超过3次");n False();n return;n };n n function get(){n msg("读取省份["+city.name+"]", getJD());n save();n n city.child[JD.si].tryCount=0;n 
load_si(function(){n JD.shen++;n if(JD.shenu0026gt;=DATA.length){n JD.shen=0;n True();n return;n };n DATA[JD.shen].tryCount=0;n n load_shen(True,False);n },function(){n False();n });n }n n if(city.child.length){n get();n }else{n ajax(city.url,function(text){n var reg=/u0026lt;tr class='citytr'u0026gt;.+?href='(.+?)'u0026gt;(.+?)u0026lt;.+?'u0026gt;(.+?)u0026lt;/ig;n var match;n while(match=reg.exec(text)){n var url=match[1];n if(url.indexOf("//")==-1 u0026amp;u0026amp; url.indexOf("/")!=0){n url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;n }n var code=match[2];n var name=match[3];n city.child.push(new cityClass(name,url,code));n }n n JD.si=0;n get();n },function(){n load_shen(True,False);n });n };n}nnfunction load_si(True,False){n var shen=DATA[JD.shen];n var city=shen.child[JD.si];n city.tryCount++;n if(city.tryCountu0026gt;3){n msg("读取城市["+city.name+"]超过3次");n False();n return;n };n n n function get(){n msg("___读取城市["+city.name+"]", getJD());n n city.child[JD.xian].tryCount=0;n JD.si++;n if(JD.siu0026gt;=shen.child.length){n JD.si=0;n True();n return;n };n shen.child[JD.si].tryCount=0;n n load_si(True,False);n }n n if(city.child.length){n get();n }else{n ajax(city.url,function(text){n var reg=/class='(?:countytr|towntr)'.+?u0026lt;\/tru0026gt;/ig;n var match;n while(match=reg.exec(text)){n var reg2=/class='(?:countytr|towntr)'.+?(?:u0026lt;tdu0026gt;u0026lt;a href='(.+?)'u0026gt;(.+?)u0026lt;.+?'u0026gt;(.+?)u0026lt;|u0026lt;tdu0026gt;(.+?)u0026lt;.+?u0026lt;tdu0026gt;(.+?)u0026lt;)/ig;n var match2;n if(match2=reg2.exec(match[0])){n var url=match2[1]||"";n if(url.indexOf("//")==-1 u0026amp;u0026amp; url.indexOf("/")!=0){n url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;n }n var code=match2[2]||match2[4];n var name=match2[3]||match2[5];n city.child.push(new cityClass(name,url,code));n }else{n msg("未知城市模式:");n msg(city.url);n msg(match[0]);n throw new Error("end");n }n }n n JD.xian=0;n get();n },function(){n load_si(True,False);n });n 
};n}nnnfunction getJD(){n var str="省:"+(JD.shen+1)+"/"+DATA.length;n var shen=DATA[JD.shen];n if(shen){n str+=" 市:"+(JD.si+1)+"/"+shen.child.length;n var si=shen.child[JD.si];n if(si){n str+=" 县:"+(JD.xian+1)+"/"+si.child.length;n }else{n str+=" 县:"+JD.xian;n }n }else{n str+=" 市:"+JD.si+" 县:"+JD.xian;n }n return str;n}nfunction save(){n n}nnvar DATA=[];nvar JD;nwindow.RunLoad=function(shen,si,xian){n RunLoad.T1=Date.now();n JD={n shen:shen||0n ,si:si||0n ,xian:xian||0n }n n function get(){n DATA[JD.shen].tryCount=0;n load_shen(function(){n console.log("完成:"+(Date.now()-RunLoad.T1)/1000+"秒");n save();n n var data=[];n for(var i=0;iu0026lt;DATA.length;i++){n data.push(DATA[i].getValue());n }n n var url=URL.createObjectURL(n new Blob([n new Uint8Array([0xEF,0xBB,0xBF])n ,"var CITY_LIST="n ,JSON.stringify(data,null,"\t")n ]n ,{"type":"text/plain"})n );n var downA=document.createElement("A");n downA.innerHTML="下载查询好城市的文件";n downA.href=url;n downA.download="data.txt";n document.body.appendChild(downA);n downA.click();n n msg("--完成--");n },function(){n save();n msg("当前进度:", getJD());n });n }n n var data=localStorage["load_data"];n if(data){n DATA=JSON.parse(data);n get();n }else{n load_all(get);n }n}n})();//@ sourceURL=console.jsnnn//立即执行代码nRunLoad()nu003c/codeu003eu003c/preu003enu003cpu003e采集截图:u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 476px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 67.86999999999999%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1083" data-height="735"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-d67da80d59b94f3e.png" data-original-width="1083" data-original-height="735" data-original-format="" data-original-filesize="369795"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003enu003ch2u003e2. 
处理数据和拼音标注u003c/h2u003enu003cpu003e数据处理就简单些了,比如编号格式化、名称格式化等。u003c/pu003enu003cpu003e拼音标注:这个需要找一个接口对文字进行拼音翻译,只有一个要求:重庆能正常的翻译成chong qing即可,翻译成zhong qing的就low了。满足这个条件,百度上搜索到的翻译小网站80%就被干掉了。u003c/pu003enu003cpu003e浏览器中打开找到的翻译接口u003ccodeu003ehttp://www.qqxiuzi.cn/zh/pinyin/u003c/codeu003e,截止到目前是能正常调用的,因为要用ajax请求数据,在页面里面就没有跨域的问题,查看网页源码,把token值记录下来,这个网站翻译请求需要带这个token,注意~刷新页面要重新获取:u003cbru003enu003c/pu003eu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 464px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 44.57%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1041" data-height="464"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-3569fb95be67eef8.png" data-original-width="1041" data-original-height="464" data-original-format="" data-original-filesize="81218"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003eu003cpu003eu003c/pu003enu003cpu003e拼音这个因为数据量比较多,采用了“4个线程”采集,先把第一步采集到的文件打开,把数据复制到打开的翻译网站浏览器控制台里面执行(相当于把数据导入),然后执行下面代码:u003c/pu003enu003cpreu003eu003ccode class="javascript"u003e/*n拼音翻译nhttp://www.qqxiuzi.cn/zh/pinyin/nnhttp://www.qqxiuzi.cn/zh/pinyin/show.phpnPOSTnt=汉字u0026amp;d=1u0026amp;s=nullu0026amp;k=1u0026amp;b=nullu0026amp;h=nullu0026amp;u=nullu0026amp;v=1u0026amp;y=nullu0026amp;z=nullu0026amp;token=页面token请求一次获取nn先加载数据n 控制台输入data.txtn*/nwindow.PageToken=window.PageToken||"";nvar FixTrim=function(name){n return name.replace(/^\s+|\s+/g,"");n};nvar CITY_LIST2;nvar QueryPinYin=function(end){n if(!window.PageToken){n console.error("Need PageToken");n return;n };n var ids=[];n n var fixCode=function(o){n if(o.deep==0){n o.orgCode="0";n }else{n o.orgCode=o.code;n if(o.deep==1){n o.code=o.code.substr(o,4);n }else{n o.code=o.code.replace(/(000000|000)/g,"");//有少部分区多3位n };n };n return o;n };n var fix=function(o,p){n var name=o.name;n if(o.deep==0){n name=name.replace(/(市|省|(维吾尔|壮族|回族)?自治区)/ig,"");n }else 
if(o.deep==1){n if(name=="市辖区"){n name=p.o2.name;n }else if(/行政区划/ig.test(name)){n name="直辖市";n }else if(name.lengthu0026gt;2){n name=name.replace(/市/ig,"");n };n }else{n if(name.lengthu0026gt;2 u0026amp;u0026amp; name!="市辖区"n u0026amp;u0026amp; !/(自治.|地区|矿区)/.test(name)){//直接排除会有同名的n name=name.replace(/(市|区|县|镇|管委会|街道办事处)/ig,"");n };n };n var o2={n name:namen ,ext_name:o.namen ,id:+o.code||0n ,ext_id:+o.orgCoden ,pid:pu0026amp;u0026amp;+p.code||0n ,deep:o.deepn };n o.o2=o2;n return o2;n };n for(var i=0;iu0026lt;CITY_LIST.length;i++){n var shen=CITY_LIST[i];n shen.deep=0;n for(var i2=0;i2u0026lt;shen.child.length;i2++){n var si=shen.child[i2];n if(!shen.code){n shen.code=si.code.substr(0,2);n ids.push(fix(fixCode(shen)));n };n si.deep=1;n ids.push(fix(fixCode(si),shen));n n n for(var i3=0;i3u0026lt;si.child.length;i3++){n var qu=si.child[i3];n qu.deep=2;n ids.push(fix(fixCode(qu),si));n };n };n };n CITY_LIST2=ids;n //console.log(JSON.stringify(ids,null,"\t"))n //return;n n var idx=-1;n var run=function(stack){n stack=+stack||0;n idx++;n if(idxu0026gt;=ids.length){n thread--;n if(thread==0){n end();n };n return;n };n n var idx_=idx;n var id=ids[idx];n if(id.P){n stack++;n if(stack%50==0){n setTimeout(function(){run()});n }else{n run(stack);n };n return;n };n n var name=id.name;n var tryCount=0;n var tryLoad=function(){n .ajax({n url:"/zh/pinyin/show.php"n ,data:"t="+encodeURIComponent(name)+"u0026amp;d=1u0026amp;s=nullu0026amp;k=1u0026amp;b=nullu0026amp;h=nullu0026amp;u=nullu0026amp;v=1u0026amp;y=nullu0026amp;z=nullu0026amp;token="+PageTokenn ,type:"POST"n ,dataType:"text"n ,timeout:1000n ,error:function(e){n if(tryCountu0026gt;3){n console.error("--QueryPinYin error--"+e);n run();n return;n };n tryCount++;n tryLoad();n }n ,success:function(txt){n txt=FixTrim(txt.replace(/u0026lt;.+?u0026gt;/g,"").replace(/\s+/g," "));n id.P=txt;n console.log("--"+idx_+"-QueryPinYin "+name+":"+txt+" --");n run();n }n });n };n tryLoad();n };n n var thread=4;n run();n run();n run();n 
run();n};nnnvar ViewDown=function(){n console.log("完成:"+(Date.now()-RunPinYin.T1)/1000+"秒");n window.CITY_LIST_PINYIN=CITY_LIST2;n var url=URL.createObjectURL(n new Blob([n new Uint8Array([0xEF,0xBB,0xBF])n ,"var CITY_LIST_PINYIN="n ,JSON.stringify(CITY_LIST2,null,"\t")n ]n ,{"type":"text/plain"})n );n var downA=document.createElement("A");n downA.innerHTML="下载查询好城市的文件";n downA.href=url;n downA.download="data-pinyin.txt";n document.body.appendChild(downA);n downA.click();n};nnvar RunPinYin=function(){n RunPinYin.T1=Date.now();n QueryPinYin(ViewDown);n};nnn//立即执行代码nif(window.CITY_LIST){n if(!PageToken){n PageToken=prompt("Token");n };n RunPinYin();n}else{n console.error("data.txt未输入");n};nu003c/codeu003eu003c/preu003enu003cpu003e这时候会提示输入token,把刚才找到的token粘贴进去,然后就开始工作了:u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 700px; max-height: 481px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 68.67999999999999%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="1076" data-height="739"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-2a5c52becbcc8424.png" data-original-width="1076" data-original-height="739" data-original-format="" data-original-filesize="252673"u003eu003c/divu003enu003c/divu003enu003cdiv class="image-caption"u003eu003c/divu003enu003c/divu003enu003cpu003e还挺快的,2分钟多点全部翻译完成。u003c/pu003enu003ch2u003e3. 
格式化成CSVu003c/h2u003enu003cpu003e数据全部有了,导出成比较正常使用的格式,CSV最好了。这个导出比较简单,任意网页控制台把第二部保存的文件打开,复制数据到任意网页控制台,然后输入以下代码:u003c/pu003enu003cpreu003eu003ccode class="javascript"u003e/*n格式并且输出为csvnn先加载数据n 控制台输入data-pinyin.txtnn导入数据库:n 文件格式Unicode,文字为字符流n 检查id重复项,修正idn 转入area_cityn 增加港澳台、海外两个省级n 检查名称重复项,修正名称n select * from area_city where len(name)=1n select pid,name,count(*) from area_city group by pid,name having COUNT(*)u0026gt;1n*/nnvar FixTrim=function(name){n return name.replace(/^\s+|\s+/g,"");n};nfunction CSVName(name){n return '"'+FixTrim(name).replace(/"/g,'""')+'"';n};nnvar CITY_CSV=["id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name"];nfor(var i=0;iu0026lt;CITY_LIST_PINYIN.length;i++){n var o=CITY_LIST_PINYIN[i];n var pf="";n var pinyin=FixTrim(o.P).toLowerCase();n var ps=pinyin.split(" ");n for(var j=0;ju0026lt;ps.lengthu0026amp;u0026amp;ju0026lt;3;j++){n pf+=ps[j].substr(0,j==0?2:1);n };n n CITY_CSV.push(o.id+","+o.pid+","+o.deep+","+CSVName(o.name)n +","+CSVName(pf)+","+CSVName(o.P)n +","+CSVName(o.ext_id+"")+","+CSVName(o.ext_name));n};nnvar url=URL.createObjectURL(n new Blob([n new Uint8Array([0xEF,0xBB,0xBF])n ,CITY_CSV.join("\n")n ]n ,{"type":"text/plain"})n);nvar downA=document.createElement("A");ndownA.innerHTML="下载查询好城市的文件";ndownA.href=url;ndownA.download="ok_data.csv";ndocument.body.appendChild(downA);ndownA.click();nu003c/codeu003eu003c/preu003enu003cpu003eOK,数据全部搞完:u003c/pu003enu003cbru003enu003cdiv class="image-package"u003enu003cdiv class="image-container" style="max-width: 624px; max-height: 418px;"u003enu003cdiv class="image-container-fill" style="padding-bottom: 66.99000000000001%;"u003eu003c/divu003enu003cdiv class="image-view" data-width="624" data-height="418"u003eu003cimg data-original-src="//upload-images.jianshu.io/upload_images/2152669-a217058081114bfe.png" data-original-width="624" data-original-height="418" data-original-format="" data-original-filesize="23305"u003eu003c/divu003enu003c/divu003enu003cdiv 
class="image-caption"u003eu003c/divu003enu003c/divu003enu003ch1u003e数据问题u003c/h1u003enu003colu003enu003cliu003eu003cpu003eid编号和国家统计局的编号基本一致,方便以后更新。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003eid重复项目前是没有(已优化过了),不过以前采集后直接对统计局的编号进行简单缩短后会有重复现象(算是精度丢失)。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003e拼音前缀取的是第一个字前两个字母和后两个字首字母,意图是让第一个字相同名称的尽量能排序在一起。排序1:u003ccodeu003e黑龙江helj、湖北hub、湖南hunu003c/codeu003e;排序2:u003ccodeu003e湖北hb、黑龙江hlj、湖南hnu003c/codeu003e,排序一胜出。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003e因为区名字是直接去掉市、区后缀,存在那么几对名字变得完全一样的,需要手动吧市区后缀加上,不然会产生小问题。u003c/pu003eu003c/liu003enu003cliu003eu003cpu003eu003cdelu003e最终数据已上传了一份到CSDN,含所有代码和本文档:u003ccodeu003ehttp://download.csdn.net/download/xiangyuecn/10226964u003c/codeu003eu003c/delu003e,u003ca href="https://links.jianshu.com/go?to=https%3A%2F%2Fgithub.com%2Fxiangyuecn%2FAreaCity-JsSpider-StatsGov%2Freleases" target="_blank" rel="nofollow"u003eGitHub下载最新数据u003c/au003eu003c/pu003eu003c/liu003enu003c/olu003en","voted_down":false,"rewardable":true,"show_paid_comment_tips":false,"share_image_url":"http://upload-images.jianshu.io/upload_images/2152669-5199f96a123a2c9b.png","slug":"fb5bba916924","user":{"liked_by_user":false,"following_count":36,"gender":1,"avatar_widget":null,"slug":"458a7e3fd0e2","intro":"","likes_count":87,"nickname":"高坚果兄弟","badges":[],"total_fp_amount":"8114874055201809467","wordage":49480,"avatar":"https://upload.jianshu.io/users/upload_avatars/2152669/5460957dbc68.jpg","id":2152669,"liked_user":false},"likes_count":0,"paid_type":"free","show_ads":true,"paid_content_accessible":false,"total_fp_amount":"0","trial_open":false,"reprintable":true,"bookmarked":false,"wordage":1390,"featured_comments_count":0,"downvotes_count":0,"wangxin_trial_open":null,"guideShow":{"new_money_time_reward_type":5,"audit_user_nickname_spliter":0,"pc_note_bottom_btn":1,"pc_like_author_guidance":0,"audit_user_background_image_spliter":0,"audit_note_spliter":0,"launch_tab":1,"include_post":0,"pc_login_guidance":1,"audit_comment_
spliter":1,"pc_note_bottom_qrcode":1,"audit_user_avatar_spliter":0,"flow_ad_check_detail_button_style":0,"audit_collection_spliter":0,"pc_top_lottery_guidance":1,"subscription_guide_entry":1,"creation_muti_function_on":1,"audit_user_spliter":1,"pc_note_popup":0},"commentable":true,"total_rewards_count":0,"id":23239084,"notebook":{"name":""},"description":"本文更新(移步查阅):19-04-15 新采集了2018的省市区三级坐标和行政区域边界19-03-22 采集了2018的城市数据18-11-28 采集了2017的城市数据数据...","first_shared_at":1517035911,"views_count":567,"notebook_id":21660119},"baseList":{"likeList":[],"rewardList":[]},"status":"success","statusCode":0},"user":{"isLogin":false,"userInfo":{}},"comments":{"list":[],"featuredList":[]}},"initialProps":{"pageProps":{"query":{"slug":"fb5bba916924"}},"localeData":{"common":{"jianshu":"简书","diamond":"简书钻","totalAssets":"总资产{num}","diamondValue":" (约{num}元)","login":"登录","logout":"注销","register":"注册","on":"开","off":"关","follow":"关注","followBook":"关注连载","following":"已关注","cancelFollow":"取消关注","publish":"发布","wordage":"字数","audio":"音频","read":"阅读","reward":"赞赏","zan":"赞","comment":"评论","expand":"展开","prevPage":"上一页","nextPage":"下一页","floor":"楼","confirm":"确定","delete":"删除","report":"举报","fontSong":"宋体","fontBlack":"黑体","chs":"简体","cht":"繁体","jianChat":"简信","postRequest":"投稿请求","likeAndZan":"喜欢和赞","rewardAndPay":"赞赏和付费","home":"我的主页","markedNotes":"收藏的文章","likedNotes":"喜欢的文章","paidThings":"已购内容","wallet":"我的钱包","setting":"设置","feedback":"帮助与反馈","loading":"加载中...","needLogin":"请登录后进行操作","trialing":"文章正在审核中...","reprintTip":"禁止转载,如需转载请通过简信或评论联系作者。"},"error":{"rewardSelf":"无法打赏自己的文章哟~"},"message":{"paidNoteTip":"付费购买后才可以参与评论哦","CommentDisableTip":"作者关闭了评论功能","contentCanNotEmptyTip":"回复内容不能为空","addComment":"评论发布成功","deleteComment":"评论删除成功","likeComment":"评论点赞成功","setReadMode":"阅读模式设置成功","setFontType":"字体设置成功","setLocale":"显示语言设置成功","follow":"关注成功","cancelFollow":"取消关注成功","copySuccess":"复制代码成功"},"header":{"homePage":"首页","download":"下载APP","discover":"发现","message":"消息","reward":"赞赏支持","editNot
e":"编辑文章","writeNote":"写文章"},"note":{},"noteMeta":{"lastModified":"最后编辑于 ","wordage":"字数 {num}","viewsCount":"阅读 {num}"},"divider":{"selfText":"以下内容为付费内容,定价 ¥{price}","paidText":"已付费,可查看以下内容","notPaidText":"还有 {percent} 的精彩内容","modify":"点击修改"},"paidPanel":{"buyNote":"支付 ¥{price} 继续阅读","buyBook":"立即拿下 ¥{price}","freeTitle":"该作品为付费连载","freeText":"购买即可永久获取连载内的所有内容,包括将来更新的内容","paidTitle":"还没看够?拿下整部连载!","paidText":"永久获得连载内的所有内容, 包括将来更新的内容"},"book":{"last":"已是最后","lookCatalog":"查看连载目录","header":"文章来自以下连载"},"action":{"like":"{num}人点赞","collection":"收入专题","report":"举报文章"},"comment":{"allComments":"全部评论","featuredComments":"精彩评论","closed":"评论已关闭","close":"关闭评论","open":"打开评论","desc":"按时间倒序","asc":"按时间正序","disableText1":"用户已关闭评论,","disableText2":"与Ta简信交流","placeholder":"写下你的评论...","publish":"发表","create":" 添加新评论","reply":" 回复","restComments":"还有{num}条评论,","expandImage":"展开剩余{num}张图","deleteText":"确定要删除评论么?"},"collection":{"title":"被以下专题收入,发现更多相似内容","putToMyCollection":"收入我的专题"},"seoList":{"title":"推荐阅读","more":"更多精彩内容"},"sideList":{"title":"推荐阅读"},"wxShareModal":{"desc":"打开微信“扫一扫”,打开网页后点击屏幕右上角分享按钮"},"bookChapterModal":{"try":"试读","toggle":"切换顺序"},"collectionModal":{"title":"收入到我管理的专题","search":"搜索我管理的专题","newCollection":"新建专题","create":"创建","nothingFound":"未找到相关专题","loadMore":"展开查看更多"},"contributeModal":{"search":"搜索专题投稿","newCollection":"新建专题","addNewOne":"去新建一个","nothingFound":"未找到相关专题","loadMore":"展开查看更多","managed":"我管理的专题","recommend":"推荐专题"},"QRCodeShow":{"payTitle":"微信扫码支付","payText":"支付金额"},"rewardModal":{"title":"给作者送糖","custom":"自定义","placeholder":"给Ta留言...","choose":"选择支付方式","balance":"简书余额","tooltip":"网站该功能暂时下线,如需使用,请到简书App操作","confirm":"确认支付","success":"赞赏成功"},"payModal":{"payBook":"购买连载","payNote":"购买文章","promotion":"优惠券","promotionFetching":"优惠券获取中...","noPromotion":"无可用优惠券","promotionNum":"{num}张可用","noUsePromotion":"不使用优惠券","validPromotion":"可用优惠券","invalidPromotion":"不可用优惠券","total":"支付总额","tip1":"· 你将购买的商品为虚拟内容服务,购买后不支持退订、转让、退换,请斟酌确认。","tip2":"· 
购买后可在“已购内容”中查看和使用。","success":"购买成功"},"reportModal":{"ad":"广告及垃圾信息","plagiarism":"抄袭或未授权转载","placeholder":"写下举报的详情情况(选填)","success":"举报成功"},"guidModal":{"modalAText":"相似文章推荐","subText":"下载简书APP,浏览更多相似文章","btnAText":"先不下载,下次再说","followOkText":"关注作者成功!","followTextTip":"下载简书APP,作者更多精彩内容更新及时提醒!","followBtn":"下次再说","downloadTipText":"更多精彩内容下载简书APP","footerDownLoadText":"下载简书APP","modabTitle":"免费送你2次抽奖机会","modalbTip":"你有很大概率抽取AirPods Pro","modalbFooterTip":"下载简书APP,天天参与抽大奖","modalReward":"抽奖","scanQrtip":"扫码下载简书APP","downloadAppText":"下载简书APP,随时随地发现和创作内容","redText":"阅读","likesText":"赞","downLoadLeft":"下载App"}},"currentLocale":"zh-CN","asPath":"/p/fb5bba916924"}},"page":"/p/[slug]","query":{"slug":"fb5bba916924"},"buildId":"ZJP8vj8XvQ-o-3nKSjb0s","assetPrefix":"https://cdn2.jianshu.io/shakespeare"}

文章来源于互联网,如有雷同请联系站长删除:从统计局采集最新的省市区县数据,纯js

发表评论