Atitit attilax擅长项目解析与大数据采集提取 词法分析 电话号码提取
package vcfvcardprj;

import com.alibaba.fastjson.JSON;
import com.attilax.fsm.TokenEndEx;
import com.attilax.parser.Token;
import com.google.common.collect.Lists;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
publicclassmblFetch{
publicstaticvoidmain(String[]args){
//TODOAuto-generated method stub
Strings="周何琪__学校郑州大学__联系方式15538130516__身高体重162cm,47k";
s="天津广播影视职业学院 韩震宇 15641656234 161cm,44kg";
List<Token>process=newmblFetch().getTokens(s);
System.out.println(JSON.toJSONString(process,true));
Stringcp=getMblCp(process);
System.out.println(cp);
}
privatestaticStringgetMblCp(List<Token>process){
for(Tokentoken:process){
if(newmblFetch().isnum(token.value))
returntoken.value;
}
return"";
}
privatechar[]process(Strings){
//TODOAuto-generated method stub
returnnull;
}
intcharIndex;
charcur_char;
char[]code_char_arr;
privateStringcurStat="start";
privateList<Token>tokens_tmp;
privateStringcurTokenTxt="";
@SuppressWarnings("unchecked")
publicList<Token>getTokens(StringcodeStr){
List<Token>li=Lists.newArrayList();
code_char_arr=codeStr.toCharArray();
while(true){
Objecttk;
try{
tk=nextTokens();
}catch(TokenEndExe){
break;
}
if(tkinstanceofToken)
li.add((Token)tk);
elseif(tkinstanceofList)
li.addAll((Collection<?extendsToken>)tk);
else
thrownewRuntimeException("token type err,curchar:"+cur_char+",colidx:"+charIndex);
}
returnli;
}
/**
*
*@returntoken or list<token>
*@throwsTokenEndEx
*/
publicObjectnextTokens()throwsTokenEndEx{
// code_char_arr = code.toCharArray();
charIndex++;
if(charIndex>code_char_arr.length-1)
thrownewTokenEndEx(newString(code_char_arr));
cur_char=code_char_arr[charIndex];
// cur_char=cur_char;
// if (this.curTokenTxt.equals("1598"))
// System.out.println("dbg");
// if(this.gColumn==30)
// System.out.println("dbg");
// get next char,,then changestat
//judecur char and curstat...then if or notchagestat
if(ishanzi(cur_char))
returnhanziEvt();
elseif(isnum(cur_char))
returnnumEvt();
else
returnsplitorCharEvt();
// break;
}
privateObjectnumEvt()throwsTokenEndEx{
if(this.curStat.equals("start")){
this.curStat="numStat";
returngaziStat();
}
if(this.curStat.equals("numStat")){
returngaziStat();
}
if(this.curStat.equals("hanziStat")){
this.curStat="numStat";
returnretNumtoken();
}
if(this.curStat.equals("splitorStat")){
this.curStat="numStat";
returnretSplitorToken();
}
returnnull;
}
privateObjecthanziEvt()throwsTokenEndEx{
if(this.curStat.equals("start")){
this.curStat="hanziStat";
returngaziStat();
}
if(this.curStat.equals("hanziStat")){
returngaziStat();
}
// if ishanzi&& cur isnumstat
if(this.curStat.equals("numStat")){
this.curStat="hanziStat";
returnretNumtoken();
}
if(this.curStat.equals("splitorStat")){
this.curStat="hanziStat";
returnretSplitorToken();
}
this.curStat="hanziStat";
returnnull;
}
privateObjectsplitorCharEvt()throwsTokenEndEx{
if(this.curStat.equals("start")){
this.curStat="splitorStat";
returngaziStat();
}
if(this.curStat.equals("hanziStat")){
this.curStat="splitorStat";
returnretHeziToken();
}
if(this.curStat.equals("numStat")){
this.curStat="splitorStat";
returnretNumtoken();
}
//gazi
this.curStat="splitorStat";
returngaziStat();
}
privateObjectretHeziToken(){
Tokentk=newToken();
tk.Text=curTokenTxt.toString();
tk.Type="hezi";
tk.value=curTokenTxt.toString();
curTokenTxt=String.valueOf(cur_char);
returntk;
}
privateObjectretNumtoken(){
Tokentk=newToken();
tk.Text=curTokenTxt.toString();
tk.Type="num";
tk.value=curTokenTxt.toString();
curTokenTxt="";
curTokenTxt=String.valueOf(cur_char);
returntk;
}
privateObjectretSplitorToken(){
Tokentk=newToken();
tk.Text=curTokenTxt.toString();
tk.Type="splitor";
tk.value=curTokenTxt.toString();
curTokenTxt=""; curTokenTxt=String.valueOf(cur_char);
returntk;
}
privateObjectgaziStat()throwsTokenEndEx{
curTokenTxt=curTokenTxt+String.valueOf(cur_char);
returnnextTokens();
}
privatebooleanishanzi(charcur_char2){
returnisChinese(String.valueOf(cur_char2));
}
privatebooleanisnum(charcur_char2){
Stringstr=String.valueOf(cur_char2);
returnisnum(str);
}
privatebooleanisnum(Stringstr){
for(inti=str.length();--i>=0;){
if(!Character.isDigit(str.charAt(i))){
returnfalse;
}
}
returntrue;
}
publicstaticbooleanisChinese(Stringstr){
StringregEx="[\u4e00-\u9fa5]";
Patternpat=Pattern.compile(regEx);
Matchermatcher=pat.matcher(str);
booleanflg=false;
if(matcher.find())
flg=true;
returnflg;
}
}
Atitit attilax擅长项目解析与大数据采集提取 词法分析 电话号码提取 package vcfvcardprj; import java.util.Collection; imp