SNAP Library, User Reference  2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
util.cpp
Go to the documentation of this file.
00001 
00002 // Graph Utilities
00003 void TGUtil::GetCdf(const TIntPrV& PdfV, TIntPrV& CdfV) {
00004   CdfV = PdfV;
00005   for (int i = 1; i < CdfV.Len(); i++) {
00006     CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
00007 }
00008 
00009 void TGUtil::GetCdf(const TFltPrV& PdfV, TFltPrV& CdfV) {
00010   CdfV = PdfV;
00011   for (int i = 1; i < CdfV.Len(); i++) {
00012     CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
00013 }
00014 
00015 void TGUtil::GetCdf(const TIntFltKdV& PdfV, TIntFltKdV& CdfV) {
00016   CdfV = PdfV;
00017   for (int i = 1; i < CdfV.Len(); i++) {
00018     CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; }
00019 }
00020 
00021 TIntPrV TGUtil::GetCdf(const TIntPrV& PdfV) {
00022   TIntPrV CdfV;
00023   GetCdf(PdfV, CdfV);
00024   return CdfV;
00025 }
00026 
00027 TFltPrV TGUtil::GetCdf(const TFltPrV& PdfV) {
00028   TFltPrV CdfV;
00029   GetCdf(PdfV, CdfV);
00030   return CdfV;
00031 }
00032 
00033 void TGUtil::GetCCdf(const TIntPrV& PdfV, TIntPrV& CCdfV) {
00034   CCdfV = PdfV;
00035   for (int i = CCdfV.Len()-2; i >= 0; i--) {
00036     CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
00037 }
00038 
00039 void TGUtil::GetCCdf(const TFltPrV& PdfV, TFltPrV& CCdfV) {
00040   CCdfV = PdfV;
00041   for (int i = CCdfV.Len()-2; i >= 0; i--) {
00042     CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
00043 }
00044 
00045 void TGUtil::GetCCdf(const TIntFltKdV& PdfV, TIntFltKdV& CCdfV) {
00046   CCdfV = PdfV;
00047   for (int i = CCdfV.Len()-2; i >= 0; i--) {
00048     CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; }
00049 }
00050 
00051 TIntPrV TGUtil::GetCCdf(const TIntPrV& PdfV) {
00052   TIntPrV CCdfV;
00053   GetCCdf(PdfV, CCdfV);
00054   return CCdfV;
00055 }
00056 
00057 TFltPrV TGUtil::GetCCdf(const TFltPrV& PdfV) {
00058   TFltPrV CCdfV;
00059   GetCCdf(PdfV, CCdfV);
00060   return CCdfV;
00061 }
00062 
00063 void TGUtil::GetPdf(const TIntPrV& CdfV, TIntPrV& PdfV) {
00064   PdfV = CdfV;
00065   for (int i = PdfV.Len()-1; i > 0; i--) {
00066     PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
00067 }
00068 
00069 void TGUtil::GetPdf(const TFltPrV& CdfV, TFltPrV& PdfV) {
00070   PdfV = CdfV;
00071   for (int i = PdfV.Len()-1; i > 0; i--) {
00072     PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
00073 }
00074 
00075 void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) {
00076   PdfV = CdfV;
00077   for (int i = PdfV.Len()-1; i > 0; i--) {
00078     PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; }
00079 }
00080 
00081 void TGUtil::Normalize(TFltPrV& PdfV) {
00082   double Sum = 0.0;
00083   for (int i = 0; i < PdfV.Len(); i++) {
00084     Sum += PdfV[i].Val2; }
00085   if (Sum <= 0.0) { return; }
00086   for (int i = 0; i < PdfV.Len(); i++) {
00087     PdfV[i].Val2 /= Sum; }
00088 }
00089 
00090 void TGUtil::Normalize(TIntFltKdV& PdfV) {
00091   double Sum = 0.0;
00092   for (int i = 0; i < PdfV.Len(); i++) {
00093     Sum += PdfV[i].Dat; }
00094   if (Sum <= 0.0) { return; }
00095   for (int i = 0; i < PdfV.Len(); i++) {
00096     PdfV[i].Dat /= Sum; }
00097 }
00098 
00099 void TGUtil::MakeExpBins(const TFltPrV& XYValV, TFltPrV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
00100   TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
00101 }
00102 
00103 void TGUtil::MakeExpBins(const TFltKdV& XYValV, TFltKdV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
00104   TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
00105 }
00106 
00107 void TGUtil::MakeExpBins(const TFltV& YValV, TFltV& ExpYValV, const double& BinFactor) {
00108   ExpYValV.Clr(true);
00109   int prevI=0;
00110   for (int i = 0; i < YValV.Len(); ) {
00111     ExpYValV.Add(YValV[i]);
00112     i = int(i*BinFactor);
00113     if (i==prevI) { i++; }
00114     prevI = i;
00115   }
00116 }
00117 
00118 void TGUtil::MakeExpBins(const TIntV& YValV, TIntV& ExpYValV, const double& BinFactor) {
00119   ExpYValV.Clr(true);
00120   int prevI=0;
00121   for (int i = 0; i < YValV.Len(); ) {
00122     ExpYValV.Add(YValV[i]);
00123     i = int(i*BinFactor);
00124     if (i==prevI) { i++; }
00125     prevI = i;
00126   }
00127 }
00128 
00130 // String helper functions and utilities
00131 // get <TagNm>TagVal</TagNm>
00132 TChA& TStrUtil::GetXmlTagVal(TXmlLx& XmlLx, const TChA& TagNm) {
00133   static TChA TagVal;
00134   EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
00135   EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm);
00136   const TXmlLxSym NextSym = XmlLx.GetSym();
00137   TagVal = XmlLx.TxtChA;
00138   if (NextSym == xsyStr) {
00139     EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
00140   } else {
00141     EAssertR(NextSym == xsyETag, TagNm); // empty tag
00142     //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
00143   }
00144   EAssertR(XmlLx.TagNm == TagNm, TagNm);
00145   return TagVal;
00146 }
00147 
00148 // get <TagNm>TagVal</TagNm>
00149 void TStrUtil::GetXmlTagNmVal(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal) {
00150   EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
00151   TagNm = XmlLx.TagNm;
00152   const TXmlLxSym NextSym = XmlLx.GetSym();
00153   TagVal = XmlLx.TxtChA;
00154   if (NextSym == xsyStr) {
00155     EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
00156   } else {
00157     EAssertR(NextSym == xsyETag, TagNm); // empty tag
00158     //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
00159   }
00160 }
00161 
00162 // get <TagNm>*</TagNm> (can be many tags inbetween
00163 bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) {
00164   if (XmlLx.GetSym() != xsySTag) {
00165     return false; }
00166   TagVal.Clr();
00167   TagNm = XmlLx.TagNm;
00168   //const TXmlLxSym NextSym = XmlLx.GetSym();
00169   while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
00170     if (TakeTagNms) {
00171       TagVal += XmlLx.TxtChA; }
00172     else if (XmlLx.Sym == xsyStr) {
00173       TagVal += XmlLx.TxtChA; }
00174     XmlLx.GetSym();
00175   }
00176   return true;
00177   //if (NextSym == xsyStr) {
00178   //  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
00179   //} else {
00180   //  EAssertR(NextSym == xsyETag, TagNm); // empty tag
00181   //  printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
00182   //}
00183 }
00184 
00185 
00186 // http://www.ijs.si/fdfd/blah.html --> www.ijs.si
00187 TChA TStrUtil::GetDomNm(const TChA& UrlChA) {
00188   int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http://
00189   if (EndSlash > 0) {
00190     const int BegSlash = UrlChA.SearchChBack('/', EndSlash);
00191     if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); }
00192     else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); }
00193   } else {
00194     if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); }
00195     EndSlash = UrlChA.SearchCh('/', 0);
00196     if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); }
00197     else { return TChA(UrlChA).ToLc(); }
00198   }
00199 }
00200 // get domain name and also strip starting www.
00201 TChA TStrUtil::GetDomNm2(const TChA& UrlChA) {
00202   TChA Dom = GetDomNm(UrlChA);
00203   if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); }
00204   else { return Dom; }
00205 }
00206 
00207 int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') {
00208   const char *c = Url.CStr();
00209   int cnt = 0;
00210   while (*c && cnt != Count) {
00211     if (*c == Ch) { cnt++; }
00212     c++;
00213   }
00214   return int(c-Url.CStr()-1);
00215 }
00216 
00217 // get website (GetDomNm2 or blog url)
00218 TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) {
00219   TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
00220   // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
00221   if (DomNm == "blog.myspace.com") {
00222     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1);
00223   }
00224   // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta
00225   // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
00226   // http://ameblo.jp/baptism/entry-10126216277.html
00227   // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
00228   // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
00229   // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
00230   // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
00231   // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
00232   // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
00233   // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
00234   // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
00235   // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
00236   // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
00237   // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
00238   // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
00239   // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
00240   // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
00241   // http://blogs.zdnet.com/hardware/?p=2391
00242   // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
00243   // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
00244   // http://blog.tv2.dk/ole.mork/entry254689.html
00245   // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
00246   // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
00247   // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
00248   if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com"
00249     || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co"
00250     || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net"
00251     || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com"
00252     || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk"
00253     || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
00254       return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1);
00255   }
00256   // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
00257   // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
00258   if (DomNm == "digg.com") {
00259     if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
00260       const int Url = PostUrlStr.SearchStr(";url=");
00261       if (Url != -1) {
00262         return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); }
00263     } else {
00264       return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
00265   }
00266   // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/
00267   // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
00268   // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
00269   // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
00270   // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
00271   if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/")
00272     || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
00273     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
00274   }
00275   // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
00276   if (DomNm=="feeds.feedburner.com") {
00277     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
00278   }
00279   // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
00280   if (DomNm=="groups.google.com") {
00281     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
00282   }
00283   // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
00284   if (DomNm=="news.google.com") { // redirect
00285     const int UrlPos = PostUrlStr.SearchStr("&url=");
00286     if (UrlPos != -1) {
00287       return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); }
00288   }
00289   // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
00290   if (DomNm == "bloggrevyen.no") { // redirect
00291     const int Http2 = PostUrlStr.SearchStr("/http://");
00292     if (Http2!=-1) {
00293       return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
00294   }
00295   //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
00296   //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
00297   if (DomNm.IsSuffix(".rd.yahoo.com")) {
00298     const int Http2 = PostUrlStr.SearchStr("/*");
00299     if (Http2!=-1) {
00300       return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
00301   }
00302   return DomNm;
00303 }
00304 
00306 bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) {
00307   UrlOut = UrlIn;
00308   if (StripEnd(UrlIn, "/", UrlOut)) {}
00309   else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
00310   else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
00311   else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
00312   if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
00313     // if UrlIn is relative url, try combine it with BaseUrl
00314     if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
00315       //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
00316       return false; }
00317     TChA Out;
00318     if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
00319     if (UrlIn[0] != '/') { Out.AddCh('/'); }
00320     Out += UrlOut;
00321     UrlOut = Out;
00322   }
00323   // http://www. --> http://
00324   if (UrlOut.IsPrefix("http://www.")) {
00325     UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx);
00326   }
00327   UrlOut.ToLc();
00328   return true;
00329 }
00330 
00331 bool TStrUtil::StripEnd(const TChA& Str, const TChA& SearchStr, TChA& NewStr) {
00332   const int StrLen = Str.Len();
00333   const int SearchStrLen = SearchStr.Len();
00334   if (StrLen < SearchStrLen) { return false; }
00335   for (int i = 0; i < SearchStrLen; i++) {
00336     if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; }
00337   }
00338   NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1);
00339   return true;
00340 }
00341 
00342 TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) {
00343   if (LongStr.Len() < MaxLen) { return LongStr; }
00344   TChA Str = LongStr.GetSubStr(0, MaxLen-1);
00345   Str += "...";
00346   return Str;
00347 }
00348 
00349 // space separated sequence of words, remove all punctuations, etc.
00350 TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) {
00351   char *b = (char *) ChA.CStr();
00352   while (*b && ! TCh::IsAlNum(*b)) { b++; }
00353   if (*b == 0) { return TChA(); }
00354   TChA OutChA(ChA.Len());
00355   char *e = b, tmp;
00356   while (*e) {
00357     b = e;
00358     while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; }
00359     if (b < e) {
00360       tmp = *e; *e=0;
00361       OutChA += b;  OutChA.AddCh(' ');
00362       *e = tmp;
00363     }
00364     while (*e && ! TCh::IsAlNum(*e)) { e++; }
00365     if (! *e) { break; }
00366   }
00367   OutChA.DelLastCh();  OutChA.ToLc();
00368   return OutChA;
00369 }
00370 
00371 // space seprated sequence of words (includes all non-blank characters, i.e., punctuations)
00372 TChA TStrUtil::GetCleanStr(const TChA& ChA) {
00373   char *b = (char *) ChA.CStr();
00374   while (*b && ! TCh::IsAlNum(*b)) { b++; }
00375   if (*b == 0) { return TChA(); }
00376   TChA OutChA(ChA.Len());
00377   char *e = b;
00378   bool ws=false;
00379   while (*e) {
00380     while (*e && TCh::IsWs(*e)) { e++; ws=true; }
00381     if (! *e) { break; }
00382     if (ws) { OutChA.AddCh(' '); ws=false; }
00383     OutChA.AddCh(*e);
00384     e++;
00385   }
00386   //OutChA.ToLc();
00387   return OutChA;
00388 }
00389 int TStrUtil::CountWords(const TChA& ChA) {
00390   return CountWords(ChA.CStr());
00391 }
00392 
00393 int TStrUtil::CountWords(const char* CStr) {
00394   int WrdCnt = 1;
00395   for (const char *c = CStr; *c; c++) {
00396     if (TCh::IsWs(*c)) { WrdCnt++; }
00397   }
00398   return WrdCnt;
00399 }
00400 
00401 int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) {
00402   TChA Tmp;
00403   TVec<char *> WrdV;
00404   SplitWords(Tmp, WrdV);
00405   int SWordCnt = 0;
00406   for (int w = 0; w < WrdV.Len(); w++) {
00407     if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
00408   }
00409   return WrdV.Len() - SWordCnt;
00410 }
00411 
00412 int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) {
00413   WrdV.Clr(false);
00414   WrdV.Add(ChA.CStr());
00415   for (char *c = (char *) ChA.CStr(); *c; c++) {
00416     if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
00417       *c = 0;
00418       if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
00419       WrdV.Add(c+1);
00420     }
00421   }
00422   return WrdV.Len();
00423 }
00424 
00425 int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) {
00426   WrdV.Clr(false);
00427   WrdV.Add(ChA.CStr());
00428   for (char *c = (char *) ChA.CStr(); *c; c++) {
00429     if (*c == Ch) {
00430       *c = 0;
00431       if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
00432       WrdV.Add(c+1);
00433     }
00434   }
00435   if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
00436   return WrdV.Len();
00437 }
00438 
00439 int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
00440   LineV.Clr(false);
00441   LineV.Add(ChA.CStr());
00442   bool IsChs=false;
00443   for (char *c = (char *) ChA.CStr(); *c; c++) {
00444     if (*c == '\n') {
00445       if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
00446       *c=0;
00447       if (SkipEmpty) {
00448         if (IsChs) { LineV.Add(c+1); }
00449       } else {
00450         LineV.Add(c+1);
00451       }
00452       IsChs=false;
00453     } else {
00454       IsChs=true;
00455     }
00456   }
00457   return LineV.Len();
00458 }
00459 
00460 int TStrUtil::SplitSentences(TChA& ChA, TVec<char *>& SentenceV) {
00461   SentenceV.Clr();
00462   const char *B = ChA.CStr();
00463   const char *E = B+ChA.Len();
00464   char *c = (char *) B;
00465   while (*c && TCh::IsWs(*c)) { c++; }
00466   if (*c) { SentenceV.Add(c); } else { return 0; }
00467   for (; c < E; c++) {
00468     if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
00469       if (c<E && *(c+1)=='"') { *c='"';  c++; } // blah." --> blah"
00470       if (c>=E) { continue; }
00471       *c=0;  c++;
00472       char *e = c-1;
00473       while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
00474       while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
00475       if (c<E) { SentenceV.Add(c); }
00476     }
00477   }
00478   return SentenceV.Len();
00479 }
00480 
00481 void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) {
00482   TextStr.Clr();
00483   char *StrB, *StrE;
00484   // use full page html: skip till <body>
00485   //PageHtmlStr = "<script fdsfs>  fsdfsd </script> jure";
00486   /*if (UseFullHtml) {
00487     StrB = PageHtmlStr.CStr();
00488     StrE = StrB+PageHtmlStr.Len();
00489     char * NewB = strstr(StrB, "<body>");
00490     if (NewB != NULL) { StrB = NewB+6; }
00491     char * NewE = strstr(StrB, "body>");
00492     if (NewE != NULL) {
00493       while (true) {
00494         char *E=strstr(NewE+4, "body>");
00495         if (E == NULL) { break; }  NewE = E; }
00496       StrE = NewE;
00497     }
00498   } else {  // only extracted post html*/
00499   StrB = (char *) HtmlStr.CStr();
00500   StrE = (char *) StrB+HtmlStr.Len(); //}
00501   for (char *e = StrB; e < StrE; ) {
00502     char* b = e;
00503     while (e<StrE && *e != '<') { e++; }
00504     // copy text
00505     char tmp=*e;  *e = 0;
00506     TextStr+= b; TextStr.AddCh(' ');  *e = tmp;
00507     if (e >= StrE) { return; }
00508     // if start of a comment: skip
00509     if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
00510       e += 3;
00511       while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
00512       e++;  continue;
00513     }
00514     // if "<script" then skip
00515     if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
00516       e += 5;
00517       while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
00518       e++;  continue;
00519     }
00520     // skip to end of tag
00521     while (e < StrE && *e != '>') { e++; }
00522     if (e>=StrE) { return; }
00523     e++;
00524   }
00525 }
00526 
00527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
00528   int AlNumCnt=0, ChCnt=0;
00529   for (const char *c = Str.CStr(); *c; c++) {
00530     if (TCh::IsWs(*c)) { continue; }
00531     if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
00532     ChCnt++;
00533   }
00534   if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
00535   return false;
00536 }
00537 
00538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
00539   const int NotWId = -1;
00540   TChA ChA(CStr);
00541   TVec<char *> WrdV;
00542   TInt WId;
00543   TStrUtil::SplitWords(ChA, WrdV);
00544   WIdV.Clr(false);
00545   for (int w = 0; w < WrdV.Len(); w++) {
00546     if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
00547     else { WIdV.Add(NotWId); }
00548   }
00549 }
00550 
00551 // and words to StrH and get a vector of word ids
00552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
00553   TChA ChA(CStr);
00554   TVec<char *> WrdV;
00555   TInt WId;
00556   TStrUtil::SplitWords(ChA, WrdV);
00557   WIdV.Clr(false);
00558   for (int w = 0; w < WrdV.Len(); w++) {
00559     WIdV.Add(StrH.AddDatId(WrdV[w]));
00560   }
00561 }
00562 
00563 // Parse time in various formats:
00564 //   10:16, 16 Sep 2004
00565 //   10:20, 2004 Sep 16
00566 //   2005-07-07 20:30:35
00567 //   23:24:07, 2005-07-10
00568 //   9 July 2005 14:38
00569 //   21:16, July 9, 2005
00570 //   06:02, 10 July 2005
00571 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) {
00572   static TStrV MonthV1, MonthV2;
00573   if (MonthV1.Empty()) {
00574     TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1);
00575     TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2);
00576   }
00577   TChA Tmp(TmStr);
00578   Tmp.ToLc();
00579   TVec<char *> WrdV;
00580   const char* End = Tmp.CStr()+Tmp.Len();
00581   int Col = -1, Cols=0;
00582   for (char *b = Tmp.CStr(); b <End; ) {
00583     WrdV.Add(b);
00584     while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
00585     if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++;  }
00586     *b=0; b++;
00587     while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
00588   }
00589   if (Cols == 2) {
00590     if (Col+1 >= WrdV.Len()) { return false; }
00591     WrdV.Del(Col+1);
00592   }
00593   if (Col<1) { return false; }
00594   const int Hr = atoi(WrdV[Col-1]);
00595   const int Min = atoi(WrdV[Col]);
00596   WrdV.Del(Col);  WrdV.Del(Col-1);
00597   if (WrdV.Len() != 3) { return false; }
00598   int y=0,m=1,d=2, Mon=-1;
00599   if (TCh::IsAlpha(WrdV[0][0])) {
00600     y=2; m=0; d=1;
00601   } else if (TCh::IsAlpha(WrdV[1][0])) {
00602     y=2; m=1; d=0;
00603   } else if (TCh::IsAlpha(WrdV[2][0])) {
00604     y=0; m=2; d=1;
00605   } else {
00606     y=0; m=1; d=2;
00607     Mon = atoi(WrdV[m]);
00608   }
00609   int Day = atoi(WrdV[d]);
00610   if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
00611   if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
00612   if (Mon == 0) { return false; }
00613   int Year = atoi(WrdV[y]);
00614   if (Day > Year) { ::Swap(Day, Year); }
00615   //printf("%d-%02d-%02d  %02d:%02d\n", Year, Mon, Day, Hr, Min);
00616   Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
00617   return true;
00618 }
00619 
00620 // Standardize first and lastnames into <last_name>_<first name innitial>
00621 TStr TStrUtil::GetStdName(TStr AuthorName) {
00622   TStr StdName;
00623   AuthorName.ToLc();
00624   AuthorName.ChangeChAll('\n', ' ');
00625   AuthorName.ChangeChAll('.', ' ');
00626   // if there is a number in the name, remove it and everything after it
00627   int i, pos = 0;
00628   while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
00629     pos++; }
00630   if (pos < AuthorName.Len()) {
00631     AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
00632   if (AuthorName.Empty()) { return TStr::GetNullStr(); }
00633 
00634   // replace everything after '('
00635   int b = AuthorName.SearchCh('(');
00636   if (b != -1) {
00637     AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
00638   // skip if contains ')'
00639   if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); }
00640   // skip if it is not a name
00641   if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
00642    || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
00643     return TStr::GetNullStr();
00644   }
00645   // remove all non-letters (latex tags, ...)
00646   TChA NewName;
00647   for (i = 0; i < AuthorName.Len(); i++) {
00648     const char Ch = AuthorName[i];
00649     if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; }
00650   }
00651   StdName = NewName;  StdName.ToTrunc();
00652   TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
00653   // too short -- not a name
00654   if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
00655   if (AuthNmV.Len() < 2) return TStr::GetNullStr();
00656 
00657   const TStr LastNm = AuthNmV.Last();
00658   if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();
00659 
00660   IAssert(isalpha(AuthNmV[0][0]));
00661   return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
00662 }
00663 
00664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) {
00665   AuthorNames.ChangeChAll('\n', ' ');
00666   AuthorNames.ToLc();
00667   // split into author names
00668   TStrV AuthV, TmpV, Tmp2V;
00669   // split on 'and'
00670   AuthorNames.SplitOnStr(" and ", TmpV);
00671   int i;
00672   for (i = 0; i < TmpV.Len(); i++) {
00673     TmpV[i].SplitOnAllCh(',', Tmp2V);  AuthV.AddV(Tmp2V); }
00674   // split on '&'
00675   TmpV = AuthV;  AuthV.Clr();
00676   for (i = 0; i < TmpV.Len(); i++) {
00677     TmpV[i].SplitOnAllCh('&', Tmp2V);  AuthV.AddV(Tmp2V); }
00678   // split on ','
00679   TmpV = AuthV;  AuthV.Clr();
00680   for (i = 0; i < TmpV.Len(); i++) {
00681     TmpV[i].SplitOnAllCh(',', Tmp2V);  AuthV.AddV(Tmp2V); }
00682   // split on ';'
00683   TmpV = AuthV;  AuthV.Clr();
00684   for (i = 0; i < TmpV.Len(); i++) {
00685     TmpV[i].SplitOnAllCh(';', Tmp2V);  AuthV.AddV(Tmp2V); }
00686   // standardize names
00687   StdNameV.Clr();
00688   //printf("\n*** %s\n", AuthorNames.CStr());
00689   for (i = 0; i < AuthV.Len(); i++) {
00690     TStr StdName = GetStdName(AuthV[i]);
00691     if (! StdName.Empty()) {
00692       //printf("\t%s  ==>  %s\n", AuthV[i].CStr(), StdName.CStr());
00693       StdNameV.Add(StdName);
00694     }
00695   }
00696 }