SNAP Library, User Reference
2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 // Graph Utilities 00003 void TGUtil::GetCdf(const TIntPrV& PdfV, TIntPrV& CdfV) { 00004 CdfV = PdfV; 00005 for (int i = 1; i < CdfV.Len(); i++) { 00006 CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; } 00007 } 00008 00009 void TGUtil::GetCdf(const TFltPrV& PdfV, TFltPrV& CdfV) { 00010 CdfV = PdfV; 00011 for (int i = 1; i < CdfV.Len(); i++) { 00012 CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; } 00013 } 00014 00015 void TGUtil::GetCdf(const TIntFltKdV& PdfV, TIntFltKdV& CdfV) { 00016 CdfV = PdfV; 00017 for (int i = 1; i < CdfV.Len(); i++) { 00018 CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; } 00019 } 00020 00021 TIntPrV TGUtil::GetCdf(const TIntPrV& PdfV) { 00022 TIntPrV CdfV; 00023 GetCdf(PdfV, CdfV); 00024 return CdfV; 00025 } 00026 00027 TFltPrV TGUtil::GetCdf(const TFltPrV& PdfV) { 00028 TFltPrV CdfV; 00029 GetCdf(PdfV, CdfV); 00030 return CdfV; 00031 } 00032 00033 void TGUtil::GetCCdf(const TIntPrV& PdfV, TIntPrV& CCdfV) { 00034 CCdfV = PdfV; 00035 for (int i = CCdfV.Len()-2; i >= 0; i--) { 00036 CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; } 00037 } 00038 00039 void TGUtil::GetCCdf(const TFltPrV& PdfV, TFltPrV& CCdfV) { 00040 CCdfV = PdfV; 00041 for (int i = CCdfV.Len()-2; i >= 0; i--) { 00042 CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; } 00043 } 00044 00045 void TGUtil::GetCCdf(const TIntFltKdV& PdfV, TIntFltKdV& CCdfV) { 00046 CCdfV = PdfV; 00047 for (int i = CCdfV.Len()-2; i >= 0; i--) { 00048 CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; } 00049 } 00050 00051 TIntPrV TGUtil::GetCCdf(const TIntPrV& PdfV) { 00052 TIntPrV CCdfV; 00053 GetCCdf(PdfV, CCdfV); 00054 return CCdfV; 00055 } 00056 00057 TFltPrV TGUtil::GetCCdf(const TFltPrV& PdfV) { 00058 TFltPrV CCdfV; 00059 GetCCdf(PdfV, CCdfV); 00060 return CCdfV; 00061 } 00062 00063 void TGUtil::GetPdf(const TIntPrV& CdfV, TIntPrV& PdfV) { 00064 PdfV = CdfV; 00065 for (int i = PdfV.Len()-1; i > 0; i--) { 00066 PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; } 00067 } 00068 00069 void 
TGUtil::GetPdf(const TFltPrV& CdfV, TFltPrV& PdfV) { 00070 PdfV = CdfV; 00071 for (int i = PdfV.Len()-1; i > 0; i--) { 00072 PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; } 00073 } 00074 00075 void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) { 00076 PdfV = CdfV; 00077 for (int i = PdfV.Len()-1; i > 0; i--) { 00078 PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; } 00079 } 00080 00081 void TGUtil::Normalize(TFltPrV& PdfV) { 00082 double Sum = 0.0; 00083 for (int i = 0; i < PdfV.Len(); i++) { 00084 Sum += PdfV[i].Val2; } 00085 if (Sum <= 0.0) { return; } 00086 for (int i = 0; i < PdfV.Len(); i++) { 00087 PdfV[i].Val2 /= Sum; } 00088 } 00089 00090 void TGUtil::Normalize(TIntFltKdV& PdfV) { 00091 double Sum = 0.0; 00092 for (int i = 0; i < PdfV.Len(); i++) { 00093 Sum += PdfV[i].Dat; } 00094 if (Sum <= 0.0) { return; } 00095 for (int i = 0; i < PdfV.Len(); i++) { 00096 PdfV[i].Dat /= Sum; } 00097 } 00098 00099 void TGUtil::MakeExpBins(const TFltPrV& XYValV, TFltPrV& ExpXYValV, const double& BinFactor, const double& MinYVal) { 00100 TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal); 00101 } 00102 00103 void TGUtil::MakeExpBins(const TFltKdV& XYValV, TFltKdV& ExpXYValV, const double& BinFactor, const double& MinYVal) { 00104 TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal); 00105 } 00106 00107 void TGUtil::MakeExpBins(const TFltV& YValV, TFltV& ExpYValV, const double& BinFactor) { 00108 ExpYValV.Clr(true); 00109 int prevI=0; 00110 for (int i = 0; i < YValV.Len(); ) { 00111 ExpYValV.Add(YValV[i]); 00112 i = int(i*BinFactor); 00113 if (i==prevI) { i++; } 00114 prevI = i; 00115 } 00116 } 00117 00118 void TGUtil::MakeExpBins(const TIntV& YValV, TIntV& ExpYValV, const double& BinFactor) { 00119 ExpYValV.Clr(true); 00120 int prevI=0; 00121 for (int i = 0; i < YValV.Len(); ) { 00122 ExpYValV.Add(YValV[i]); 00123 i = int(i*BinFactor); 00124 if (i==prevI) { i++; } 00125 prevI = i; 00126 } 00127 } 00128 00130 // String helper functions and utilities 
00131 // get <TagNm>TagVal</TagNm> 00132 TChA& TStrUtil::GetXmlTagVal(TXmlLx& XmlLx, const TChA& TagNm) { 00133 static TChA TagVal; 00134 EAssertR(XmlLx.GetSym() == xsySTag, TagNm); 00135 EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm); 00136 const TXmlLxSym NextSym = XmlLx.GetSym(); 00137 TagVal = XmlLx.TxtChA; 00138 if (NextSym == xsyStr) { 00139 EAssertR(XmlLx.GetSym() == xsyETag, TagNm); 00140 } else { 00141 EAssertR(NextSym == xsyETag, TagNm); // empty tag 00142 //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr()); 00143 } 00144 EAssertR(XmlLx.TagNm == TagNm, TagNm); 00145 return TagVal; 00146 } 00147 00148 // get <TagNm>TagVal</TagNm> 00149 void TStrUtil::GetXmlTagNmVal(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal) { 00150 EAssertR(XmlLx.GetSym() == xsySTag, TagNm); 00151 TagNm = XmlLx.TagNm; 00152 const TXmlLxSym NextSym = XmlLx.GetSym(); 00153 TagVal = XmlLx.TxtChA; 00154 if (NextSym == xsyStr) { 00155 EAssertR(XmlLx.GetSym() == xsyETag, TagNm); 00156 } else { 00157 EAssertR(NextSym == xsyETag, TagNm); // empty tag 00158 //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr()); 00159 } 00160 } 00161 00162 // get <TagNm>*</TagNm> (can be many tags inbetween 00163 bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) { 00164 if (XmlLx.GetSym() != xsySTag) { 00165 return false; } 00166 TagVal.Clr(); 00167 TagNm = XmlLx.TagNm; 00168 //const TXmlLxSym NextSym = XmlLx.GetSym(); 00169 while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) { 00170 if (TakeTagNms) { 00171 TagVal += XmlLx.TxtChA; } 00172 else if (XmlLx.Sym == xsyStr) { 00173 TagVal += XmlLx.TxtChA; } 00174 XmlLx.GetSym(); 00175 } 00176 return true; 00177 //if (NextSym == xsyStr) { 00178 // EAssertR(XmlLx.GetSym() == xsyETag, TagNm); 00179 //} else { 00180 // EAssertR(NextSym == xsyETag, TagNm); // empty tag 00181 // printf(" token: %s empty! 
%s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr()); 00182 //} 00183 } 00184 00185 00186 // http://www.ijs.si/fdfd/blah.html --> www.ijs.si 00187 TChA TStrUtil::GetDomNm(const TChA& UrlChA) { 00188 int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http:// 00189 if (EndSlash > 0) { 00190 const int BegSlash = UrlChA.SearchChBack('/', EndSlash); 00191 if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); } 00192 else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); } 00193 } else { 00194 if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); } 00195 EndSlash = UrlChA.SearchCh('/', 0); 00196 if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); } 00197 else { return TChA(UrlChA).ToLc(); } 00198 } 00199 } 00200 // get domain name and also strip starting www. 00201 TChA TStrUtil::GetDomNm2(const TChA& UrlChA) { 00202 TChA Dom = GetDomNm(UrlChA); 00203 if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); } 00204 else { return Dom; } 00205 } 00206 00207 int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') { 00208 const char *c = Url.CStr(); 00209 int cnt = 0; 00210 while (*c && cnt != Count) { 00211 if (*c == Ch) { cnt++; } 00212 c++; 00213 } 00214 return int(c-Url.CStr()-1); 00215 } 00216 00217 // get website (GetDomNm2 or blog url) 00218 TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) { 00219 TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr); 00220 // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539 00221 if (DomNm == "blog.myspace.com") { 00222 return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1); 00223 } 00224 // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta 00225 // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx 00226 // http://ameblo.jp/baptism/entry-10126216277.html 00227 // 
http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q 00228 // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php 00229 // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the 00230 // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html 00231 // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx 00232 // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx 00233 // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html 00234 // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo 00235 // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the 00236 // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html 00237 // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx 00238 // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx 00239 // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html 00240 // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo 00241 // http://blogs.zdnet.com/hardware/?p=2391 00242 // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php 00243 // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html 00244 // http://blog.tv2.dk/ole.mork/entry254689.html 00245 // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp 00246 // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html 00247 // 
http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo 00248 if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com" 00249 || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co" 00250 || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" 00251 || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com" 00252 || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk" 00253 || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") { 00254 return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); 00255 } 00256 // http://digg.com/submit?phase=2&url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&title=and 00257 // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show 00258 if (DomNm == "digg.com") { 00259 if (PostUrlStr.IsPrefix("http://digg.com/submit?")) { 00260 const int Url = PostUrlStr.SearchStr(";url="); 00261 if (Url != -1) { 00262 return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); } 00263 } else { 00264 return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); } 00265 } 00266 // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/ 00267 // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html 00268 // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html 00269 // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas 00270 // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html 00271 if 
(PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/") 00272 || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) { 00273 return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); 00274 } 00275 // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640 00276 if (DomNm=="feeds.feedburner.com") { 00277 return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); 00278 } 00279 // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c 00280 if (DomNm=="groups.google.com") { 00281 return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); 00282 } 00283 // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa 00284 if (DomNm=="news.google.com") { // redirect 00285 const int UrlPos = PostUrlStr.SearchStr("&url="); 00286 if (UrlPos != -1) { 00287 return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); } 00288 } 00289 // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de 00290 if (DomNm == "bloggrevyen.no") { // redirect 00291 const int Http2 = PostUrlStr.SearchStr("/http://"); 00292 if (Http2!=-1) { 00293 return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); } 00294 } 00295 //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn 00296 //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha 00297 if (DomNm.IsSuffix(".rd.yahoo.com")) { 00298 const int Http2 = PostUrlStr.SearchStr("/*"); 00299 if (Http2!=-1) { 00300 return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); } 
00301 } 00302 return DomNm; 00303 } 00304 00306 bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) { 00307 UrlOut = UrlIn; 00308 if (StripEnd(UrlIn, "/", UrlOut)) {} 00309 else if (StripEnd(UrlIn, "/index.html", UrlOut)) {} 00310 else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {} 00311 else if (StripEnd(UrlIn, "/index.php", UrlOut)) {} 00312 if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) { 00313 // if UrlIn is relative url, try combine it with BaseUrl 00314 if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) { 00315 //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr()); 00316 return false; } 00317 TChA Out; 00318 if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; } 00319 if (UrlIn[0] != '/') { Out.AddCh('/'); } 00320 Out += UrlOut; 00321 UrlOut = Out; 00322 } 00323 // http://www. --> http:// 00324 if (UrlOut.IsPrefix("http://www.")) { 00325 UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx); 00326 } 00327 UrlOut.ToLc(); 00328 return true; 00329 } 00330 00331 bool TStrUtil::StripEnd(const TChA& Str, const TChA& SearchStr, TChA& NewStr) { 00332 const int StrLen = Str.Len(); 00333 const int SearchStrLen = SearchStr.Len(); 00334 if (StrLen < SearchStrLen) { return false; } 00335 for (int i = 0; i < SearchStrLen; i++) { 00336 if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; } 00337 } 00338 NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1); 00339 return true; 00340 } 00341 00342 TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) { 00343 if (LongStr.Len() < MaxLen) { return LongStr; } 00344 TChA Str = LongStr.GetSubStr(0, MaxLen-1); 00345 Str += "..."; 00346 return Str; 00347 } 00348 00349 // space separated sequence of words, remove all punctuations, etc. 00350 TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) { 00351 char *b = (char *) ChA.CStr(); 00352 while (*b && ! 
TCh::IsAlNum(*b)) { b++; } 00353 if (*b == 0) { return TChA(); } 00354 TChA OutChA(ChA.Len()); 00355 char *e = b, tmp; 00356 while (*e) { 00357 b = e; 00358 while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; } 00359 if (b < e) { 00360 tmp = *e; *e=0; 00361 OutChA += b; OutChA.AddCh(' '); 00362 *e = tmp; 00363 } 00364 while (*e && ! TCh::IsAlNum(*e)) { e++; } 00365 if (! *e) { break; } 00366 } 00367 OutChA.DelLastCh(); OutChA.ToLc(); 00368 return OutChA; 00369 } 00370 00371 // space seprated sequence of words (includes all non-blank characters, i.e., punctuations) 00372 TChA TStrUtil::GetCleanStr(const TChA& ChA) { 00373 char *b = (char *) ChA.CStr(); 00374 while (*b && ! TCh::IsAlNum(*b)) { b++; } 00375 if (*b == 0) { return TChA(); } 00376 TChA OutChA(ChA.Len()); 00377 char *e = b; 00378 bool ws=false; 00379 while (*e) { 00380 while (*e && TCh::IsWs(*e)) { e++; ws=true; } 00381 if (! *e) { break; } 00382 if (ws) { OutChA.AddCh(' '); ws=false; } 00383 OutChA.AddCh(*e); 00384 e++; 00385 } 00386 //OutChA.ToLc(); 00387 return OutChA; 00388 } 00389 int TStrUtil::CountWords(const TChA& ChA) { 00390 return CountWords(ChA.CStr()); 00391 } 00392 00393 int TStrUtil::CountWords(const char* CStr) { 00394 int WrdCnt = 1; 00395 for (const char *c = CStr; *c; c++) { 00396 if (TCh::IsWs(*c)) { WrdCnt++; } 00397 } 00398 return WrdCnt; 00399 } 00400 00401 int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) { 00402 TChA Tmp; 00403 TVec<char *> WrdV; 00404 SplitWords(Tmp, WrdV); 00405 int SWordCnt = 0; 00406 for (int w = 0; w < WrdV.Len(); w++) { 00407 if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; } 00408 } 00409 return WrdV.Len() - SWordCnt; 00410 } 00411 00412 int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) { 00413 WrdV.Clr(false); 00414 WrdV.Add(ChA.CStr()); 00415 for (char *c = (char *) ChA.CStr(); *c; c++) { 00416 if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! 
TCh::IsAlNum(*c))) { 00417 *c = 0; 00418 if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); } 00419 WrdV.Add(c+1); 00420 } 00421 } 00422 return WrdV.Len(); 00423 } 00424 00425 int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) { 00426 WrdV.Clr(false); 00427 WrdV.Add(ChA.CStr()); 00428 for (char *c = (char *) ChA.CStr(); *c; c++) { 00429 if (*c == Ch) { 00430 *c = 0; 00431 if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); } 00432 WrdV.Add(c+1); 00433 } 00434 } 00435 if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); } 00436 return WrdV.Len(); 00437 } 00438 00439 int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) { 00440 LineV.Clr(false); 00441 LineV.Add(ChA.CStr()); 00442 bool IsChs=false; 00443 for (char *c = (char *) ChA.CStr(); *c; c++) { 00444 if (*c == '\n') { 00445 if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n 00446 *c=0; 00447 if (SkipEmpty) { 00448 if (IsChs) { LineV.Add(c+1); } 00449 } else { 00450 LineV.Add(c+1); 00451 } 00452 IsChs=false; 00453 } else { 00454 IsChs=true; 00455 } 00456 } 00457 return LineV.Len(); 00458 } 00459 00460 int TStrUtil::SplitSentences(TChA& ChA, TVec<char *>& SentenceV) { 00461 SentenceV.Clr(); 00462 const char *B = ChA.CStr(); 00463 const char *E = B+ChA.Len(); 00464 char *c = (char *) B; 00465 while (*c && TCh::IsWs(*c)) { c++; } 00466 if (*c) { SentenceV.Add(c); } else { return 0; } 00467 for (; c < E; c++) { 00468 if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence 00469 if (c<E && *(c+1)=='"') { *c='"'; c++; } // blah." --> blah" 00470 if (c>=E) { continue; } 00471 *c=0; c++; 00472 char *e = c-1; 00473 while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars 00474 while (c<E && ! 
(TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum 00475 if (c<E) { SentenceV.Add(c); } 00476 } 00477 } 00478 return SentenceV.Len(); 00479 } 00480 00481 void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) { 00482 TextStr.Clr(); 00483 char *StrB, *StrE; 00484 // use full page html: skip till <body> 00485 //PageHtmlStr = "<script fdsfs> fsdfsd </script> jure"; 00486 /*if (UseFullHtml) { 00487 StrB = PageHtmlStr.CStr(); 00488 StrE = StrB+PageHtmlStr.Len(); 00489 char * NewB = strstr(StrB, "<body>"); 00490 if (NewB != NULL) { StrB = NewB+6; } 00491 char * NewE = strstr(StrB, "body>"); 00492 if (NewE != NULL) { 00493 while (true) { 00494 char *E=strstr(NewE+4, "body>"); 00495 if (E == NULL) { break; } NewE = E; } 00496 StrE = NewE; 00497 } 00498 } else { // only extracted post html*/ 00499 StrB = (char *) HtmlStr.CStr(); 00500 StrE = (char *) StrB+HtmlStr.Len(); //} 00501 for (char *e = StrB; e < StrE; ) { 00502 char* b = e; 00503 while (e<StrE && *e != '<') { e++; } 00504 // copy text 00505 char tmp=*e; *e = 0; 00506 TextStr+= b; TextStr.AddCh(' '); *e = tmp; 00507 if (e >= StrE) { return; } 00508 // if start of a comment: skip 00509 if (e[1]=='!' 
&& e[2]=='-' && e[3]=='-') { // comment 00510 e += 3; 00511 while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; } 00512 e++; continue; 00513 } 00514 // if "<script" then skip 00515 if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') { 00516 e += 5; 00517 while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; } 00518 e++; continue; 00519 } 00520 // skip to end of tag 00521 while (e < StrE && *e != '>') { e++; } 00522 if (e>=StrE) { return; } 00523 e++; 00524 } 00525 } 00526 00527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) { 00528 int AlNumCnt=0, ChCnt=0; 00529 for (const char *c = Str.CStr(); *c; c++) { 00530 if (TCh::IsWs(*c)) { continue; } 00531 if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; } 00532 ChCnt++; 00533 } 00534 if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; } 00535 return false; 00536 } 00537 00538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) { 00539 const int NotWId = -1; 00540 TChA ChA(CStr); 00541 TVec<char *> WrdV; 00542 TInt WId; 00543 TStrUtil::SplitWords(ChA, WrdV); 00544 WIdV.Clr(false); 00545 for (int w = 0; w < WrdV.Len(); w++) { 00546 if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); } 00547 else { WIdV.Add(NotWId); } 00548 } 00549 } 00550 00551 // and words to StrH and get a vector of word ids 00552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) { 00553 TChA ChA(CStr); 00554 TVec<char *> WrdV; 00555 TInt WId; 00556 TStrUtil::SplitWords(ChA, WrdV); 00557 WIdV.Clr(false); 00558 for (int w = 0; w < WrdV.Len(); w++) { 00559 WIdV.Add(StrH.AddDatId(WrdV[w])); 00560 } 00561 } 00562 00563 // Parse time in various formats: 00564 // 10:16, 16 Sep 2004 00565 // 10:20, 2004 Sep 16 00566 // 2005-07-07 20:30:35 00567 // 23:24:07, 2005-07-10 00568 // 9 July 2005 14:38 00569 // 21:16, July 9, 2005 00570 // 06:02, 10 July 2005 00571 bool 
TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) { 00572 static TStrV MonthV1, MonthV2; 00573 if (MonthV1.Empty()) { 00574 TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1); 00575 TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2); 00576 } 00577 TChA Tmp(TmStr); 00578 Tmp.ToLc(); 00579 TVec<char *> WrdV; 00580 const char* End = Tmp.CStr()+Tmp.Len(); 00581 int Col = -1, Cols=0; 00582 for (char *b = Tmp.CStr(); b <End; ) { 00583 WrdV.Add(b); 00584 while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; } 00585 if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++; } 00586 *b=0; b++; 00587 while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; } 00588 } 00589 if (Cols == 2) { 00590 if (Col+1 >= WrdV.Len()) { return false; } 00591 WrdV.Del(Col+1); 00592 } 00593 if (Col<1) { return false; } 00594 const int Hr = atoi(WrdV[Col-1]); 00595 const int Min = atoi(WrdV[Col]); 00596 WrdV.Del(Col); WrdV.Del(Col-1); 00597 if (WrdV.Len() != 3) { return false; } 00598 int y=0,m=1,d=2, Mon=-1; 00599 if (TCh::IsAlpha(WrdV[0][0])) { 00600 y=2; m=0; d=1; 00601 } else if (TCh::IsAlpha(WrdV[1][0])) { 00602 y=2; m=1; d=0; 00603 } else if (TCh::IsAlpha(WrdV[2][0])) { 00604 y=0; m=2; d=1; 00605 } else { 00606 y=0; m=1; d=2; 00607 Mon = atoi(WrdV[m]); 00608 } 00609 int Day = atoi(WrdV[d]); 00610 if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; } 00611 if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; } 00612 if (Mon == 0) { return false; } 00613 int Year = atoi(WrdV[y]); 00614 if (Day > Year) { ::Swap(Day, Year); } 00615 //printf("%d-%02d-%02d %02d:%02d\n", Year, Mon, Day, Hr, Min); 00616 Tm = TSecTm(Year, Mon, Day, Hr, Min, 0); 00617 return true; 00618 } 00619 00620 // Standardize first and lastnames into <last_name>_<first name innitial> 00621 TStr TStrUtil::GetStdName(TStr AuthorName) { 00622 TStr StdName; 00623 AuthorName.ToLc(); 00624 
AuthorName.ChangeChAll('\n', ' '); 00625 AuthorName.ChangeChAll('.', ' '); 00626 // if there is a number in the name, remove it and everything after it 00627 int i, pos = 0; 00628 while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) { 00629 pos++; } 00630 if (pos < AuthorName.Len()) { 00631 AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); } 00632 if (AuthorName.Empty()) { return TStr::GetNullStr(); } 00633 00634 // replace everything after '(' 00635 int b = AuthorName.SearchCh('('); 00636 if (b != -1) { 00637 AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); } 00638 // skip if contains ')' 00639 if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); } 00640 // skip if it is not a name 00641 if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1 00642 || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) { 00643 return TStr::GetNullStr(); 00644 } 00645 // remove all non-letters (latex tags, ...) 00646 TChA NewName; 00647 for (i = 0; i < AuthorName.Len(); i++) { 00648 const char Ch = AuthorName[i]; 00649 if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; } 00650 } 00651 StdName = NewName; StdName.ToTrunc(); 00652 TStrV AuthNmV; StdName.SplitOnWs(AuthNmV); 00653 // too short -- not a name 00654 if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast(); 00655 if (AuthNmV.Len() < 2) return TStr::GetNullStr(); 00656 00657 const TStr LastNm = AuthNmV.Last(); 00658 if (! 
TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr(); 00659 00660 IAssert(isalpha(AuthNmV[0][0])); 00661 return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]); 00662 } 00663 00664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) { 00665 AuthorNames.ChangeChAll('\n', ' '); 00666 AuthorNames.ToLc(); 00667 // split into author names 00668 TStrV AuthV, TmpV, Tmp2V; 00669 // split on 'and' 00670 AuthorNames.SplitOnStr(" and ", TmpV); 00671 int i; 00672 for (i = 0; i < TmpV.Len(); i++) { 00673 TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); } 00674 // split on '&' 00675 TmpV = AuthV; AuthV.Clr(); 00676 for (i = 0; i < TmpV.Len(); i++) { 00677 TmpV[i].SplitOnAllCh('&', Tmp2V); AuthV.AddV(Tmp2V); } 00678 // split on ',' 00679 TmpV = AuthV; AuthV.Clr(); 00680 for (i = 0; i < TmpV.Len(); i++) { 00681 TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); } 00682 // split on ';' 00683 TmpV = AuthV; AuthV.Clr(); 00684 for (i = 0; i < TmpV.Len(); i++) { 00685 TmpV[i].SplitOnAllCh(';', Tmp2V); AuthV.AddV(Tmp2V); } 00686 // standardize names 00687 StdNameV.Clr(); 00688 //printf("\n*** %s\n", AuthorNames.CStr()); 00689 for (i = 0; i < AuthV.Len(); i++) { 00690 TStr StdName = GetStdName(AuthV[i]); 00691 if (! StdName.Empty()) { 00692 //printf("\t%s ==> %s\n", AuthV[i].CStr(), StdName.CStr()); 00693 StdNameV.Add(StdName); 00694 } 00695 } 00696 }