SNAP Library, User Reference  2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
ss.cpp
Go to the documentation of this file.
00001 
00002 // Spread-Sheet
00003 TStr& TSs::At(const int& X, const int& Y){
00004 //  Fail;
00005   if (Y>=CellStrVV.Len()){CellStrVV.Reserve(Y+1, Y+1);}
00006   if (X>=CellStrVV[Y]->Len()){CellStrVV[Y]->V.Reserve(X+1, X+1);}
00007   return CellStrVV[Y]->V[X];
00008 }
00009 
00010 void TSs::PutVal(const int& X, const int& Y, const TStr& Str){
00011   if (Y>=CellStrVV.Len()){CellStrVV.Reserve(Y+1, Y+1);}
00012   if (X>=CellStrVV[Y]->Len()){CellStrVV[Y]->V.Reserve(X+1, X+1);}
00013   CellStrVV[Y]->V[X]=Str;
00014 }
00015 
00016 TStr TSs::GetVal(const int& X, const int& Y) const {
00017   if ((0<=Y)&&(Y<CellStrVV.Len())){
00018     if ((0<=X)&&(X<CellStrVV[Y]->Len())){
00019       return CellStrVV[Y]->V[X];
00020     } else {
00021       return TStr::GetNullStr();
00022     }
00023   } else {
00024     return TStr::GetNullStr();
00025   }
00026 }
00027 
00028 int TSs::GetXLen() const {
00029   if (CellStrVV.Len()==0){
00030     return 0;
00031   } else {
00032     int MxXLen=CellStrVV[0]->Len();
00033     for (int Y=1; Y<CellStrVV.Len(); Y++){
00034       MxXLen=TInt::GetMx(MxXLen, CellStrVV[Y]->Len());}
00035     return MxXLen;
00036   }
00037 }
00038 
00039 int TSs::GetXLen(const int& Y) const {
00040   if ((0<=Y)&&(Y<CellStrVV.Len())){
00041     return CellStrVV[Y]->Len();
00042   } else {
00043     return 0;
00044   }
00045 }
00046 
00047 int TSs::GetYLen() const {
00048   return CellStrVV.Len();
00049 }
00050 
00051 int TSs::SearchX(const int& Y, const TStr& Str) const {
00052   return CellStrVV[Y]->V.SearchForw(Str);
00053 }
00054 
00055 int TSs::SearchY(const int& X, const TStr& Str) const {
00056   int YLen=GetYLen();
00057   for (int Y=0; Y<YLen; Y++){
00058      if (Str==GetVal(X, Y)){return Y;}}
00059   return -1;
00060 }
00061 
00062 void TSs::DelX(const int& X){
00063   int YLen=GetYLen();
00064   for (int Y=0; Y<YLen; Y++){
00065     CellStrVV[Y]->V.Del(X);
00066   }
00067 }
00068 
00069 void TSs::DelY(const int& Y){
00070   CellStrVV.Del(Y);
00071 }
00072 
00073 int TSs::GetFldX(const TStr& FldNm, const TStr& NewFldNm, const int& Y) const {
00074   if (GetYLen()>Y){
00075     int XLen=GetXLen(Y);
00076     for (int X=0; X<XLen; X++){
00077       if (GetVal(X, Y).GetTrunc()==FldNm){
00078         if (!NewFldNm.Empty()){GetVal(X, Y)=NewFldNm;}
00079         return X;
00080       }
00081     }
00082     return -1;
00083   } else {
00084     return -1;
00085   }
00086 }
00087 
00088 int TSs::GetFldY(const TStr& FldNm, const TStr& NewFldNm, const int& X) const {
00089   for (int Y=0; Y<GetYLen(); Y++){
00090     if (GetXLen(Y)>X){
00091       if (GetVal(X, Y).GetTrunc()==FldNm){
00092         if (!NewFldNm.Empty()){GetVal(X, Y)=NewFldNm;}
00093         return Y;
00094       }
00095     }
00096   }
00097   return -1;
00098 }
00099 
00100 PSs TSs::LoadTxt(
00101  const TSsFmt& SsFmt, const TStr& FNm,
00102  const PNotify& Notify, const bool& IsExcelEoln,
00103  const int& MxY, const TIntV& AllowedColNV, const bool& IsQStr){
00104   TNotify::OnNotify(Notify, ntInfo, TStr("Loading File ")+FNm+" ...");
00105   PSIn SIn=TFIn::New(FNm);
00106   PSs Ss=TSs::New();
00107   if (!SIn->Eof()){
00108     int X=0; int Y=0; int PrevX=-1; int PrevY=-1;
00109     char Ch=SIn->GetCh(); TChA ChA;
00110     while (!SIn->Eof()){
00111       // compose value
00112       ChA.Clr();
00113       if (IsQStr&&(Ch=='"')){
00114         // quoted string ('""' sequence means '"')
00115         Ch=SIn->GetCh();
00116         forever {
00117           while ((!SIn->Eof())&&(Ch!='"')){
00118             ChA+=Ch; Ch=SIn->GetCh();}
00119           if (Ch=='"'){
00120             Ch=SIn->GetCh();
00121             if (Ch=='"'){ChA+=Ch; Ch=SIn->GetCh();}
00122             else {break;}
00123           }
00124         }
00125       } else {
00126         if (SsFmt==ssfTabSep){
00127           while ((!SIn->Eof())&&(Ch!='\t')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00128             ChA+=Ch; Ch=SIn->GetCh();
00129           }
00130         } else
00131         if (SsFmt==ssfCommaSep){
00132           while ((!SIn->Eof())&&(Ch!=',')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00133             ChA+=Ch; Ch=SIn->GetCh();
00134           }
00135         } else
00136         if (SsFmt==ssfSemicolonSep){
00137           while ((!SIn->Eof())&&(Ch!=';')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00138             ChA+=Ch; Ch=SIn->GetCh();
00139           }
00140         } else
00141         if (SsFmt==ssfVBar){
00142           while ((!SIn->Eof())&&(Ch!='|')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00143             ChA+=Ch; Ch=SIn->GetCh();
00144           }
00145         } else
00146         if (SsFmt==ssfSpaceSep){
00147           while ((!SIn->Eof())&&(Ch!=' ')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00148             ChA+=Ch; Ch=SIn->GetCh();
00149           }
00150         } else {
00151           Fail;
00152         }
00153       }
00154       // add new line if neccessary
00155       if (PrevY!=Y){
00156         if ((MxY!=-1)&&(Ss->CellStrVV.Len()==MxY)){break;}
00157         Ss->CellStrVV.Add(TStrVP::New()); PrevY=Y;
00158         int Recs=Ss->CellStrVV.Len();
00159         if (Recs%1000==0){
00160           TNotify::OnStatus(Notify, TStr::Fmt("  %d\r", Recs));}
00161       }
00162       // add value to spreadsheet
00163       if (AllowedColNV.Empty()||AllowedColNV.IsIn(X)){
00164         Ss->CellStrVV[Y]->V.Add(ChA); 
00165       }
00166       // process delimiters
00167       if (SIn->Eof()){
00168         break;
00169       } else
00170       if ((SsFmt==ssfTabSep)&&(Ch=='\t')){
00171         X++; Ch=SIn->GetCh();
00172       } else
00173       if ((SsFmt==ssfCommaSep)&&(Ch==',')){
00174         X++; Ch=SIn->GetCh();
00175       } else
00176       if ((SsFmt==ssfSemicolonSep)&&(Ch==';')){
00177         X++; Ch=SIn->GetCh();
00178       } else
00179       if ((SsFmt==ssfVBar)&&(Ch=='|')){
00180         X++; Ch=SIn->GetCh();
00181       } else
00182       if ((SsFmt==ssfSpaceSep)&&(Ch==' ')){
00183         X++; Ch=SIn->GetCh();
00184       } else
00185       if (Ch=='\r'){
00186         if ((PrevX!=-1)&&(X!=PrevX)){
00187           TNotify::OnNotify(Notify, ntWarn, "Number of fields is not the same!");}
00188         PrevX=X; X=0; Y++; Ch=SIn->GetCh();
00189         if ((Ch=='\n')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00190         //if (Ss->CellStrVV.Len()%1000==0){Y--; break;}
00191       } else
00192       if (Ch=='\n'){
00193         if ((PrevX!=-1)&&(X!=PrevX)){
00194           TNotify::OnNotify(Notify, ntWarn, "Number of fields is not the same!");}
00195         PrevX=X; X=0; Y++; Ch=SIn->GetCh();
00196         if ((Ch=='\r')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00197         //if (Ss->CellStrVV.Len()%1000==0){Y--; break;}
00198       } else {
00199         Fail;
00200       }
00201     }
00202   }
00203   int Recs=Ss->CellStrVV.Len();
00204   TNotify::OnNotify(Notify, ntInfo, TStr::Fmt("  %d records read.", Recs));
00205   TNotify::OnNotify(Notify, ntInfo, "... Done.");
00206   return Ss;
00207 }
00208 
00209 void TSs::SaveTxt(const TStr& FNm, const PNotify&) const {
00210   PSOut SOut=TFOut::New(FNm);
00211   for (int Y=0; Y<CellStrVV.Len(); Y++){
00212     for (int X=0; X<CellStrVV[Y]->Len(); X++){
00213       if (X>0){SOut->PutCh('\t');}
00214       TStr Str=CellStrVV[Y]->V[X];
00215       TChA ChA(Str);
00216       for (int ChN=0; ChN<ChA.Len(); ChN++){
00217         char Ch=ChA[ChN];
00218         if ((Ch=='\t')||(Ch=='\r')||(Ch=='\n')){
00219           ChA.PutCh(ChN, ' ');
00220         }
00221       }
00222       SOut->PutStr(ChA);
00223     }
00224     SOut->PutCh('\r'); SOut->PutCh('\n');
00225   }
00226 }
00227 
00228 void TSs::LoadTxtFldV(
00229  const TSsFmt& SsFmt, const PSIn& SIn, char& Ch,
00230  TStrV& FldValV, const bool& IsExcelEoln, const bool& IsQStr){
00231   if (!SIn->Eof()){
00232     FldValV.Clr(false); int X=0;
00233     if (Ch==TCh::NullCh){Ch=SIn->GetCh();}
00234     TChA ChA;
00235     while (!SIn->Eof()){
00236       // compose value
00237       ChA.Clr();
00238       if (IsQStr&&(Ch=='"')){
00239         // quoted string ('""' sequence means '"')
00240         Ch=SIn->GetCh();
00241         forever {
00242           while ((!SIn->Eof())&&(Ch!='"')){
00243             ChA+=Ch; Ch=SIn->GetCh();}
00244           if (Ch=='"'){
00245             Ch=SIn->GetCh();
00246             if (Ch=='"'){ChA+=Ch; Ch=SIn->GetCh();}
00247             else {break;}
00248           }
00249         }
00250       } else {
00251         if (SsFmt==ssfTabSep){
00252           while ((!SIn->Eof())&&(Ch!='\t')&&(Ch!='\r')&&
00253            ((Ch!='\n')||IsExcelEoln)){
00254             ChA+=Ch; Ch=SIn->GetCh();
00255           }
00256           if ((!ChA.Empty())&&(ChA.LastCh()=='\"')){
00257             ChA.Pop();}
00258         } else
00259         if (SsFmt==ssfCommaSep){
00260           while ((!SIn->Eof())&&(Ch!=',')&&(Ch!='\r')&&
00261            ((Ch!='\n')||IsExcelEoln)){
00262             ChA+=Ch; Ch=SIn->GetCh();
00263           }
00264         } else
00265         if (SsFmt==ssfSemicolonSep){
00266           while ((!SIn->Eof())&&(Ch!=';')&&(Ch!='\r')&&
00267            ((Ch!='\n')||IsExcelEoln)){
00268             ChA+=Ch; Ch=SIn->GetCh();
00269           }
00270         } else
00271         if (SsFmt==ssfVBar){
00272           while ((!SIn->Eof())&&(Ch!='|')&&(Ch!='\r')&&
00273            ((Ch!='\n')||IsExcelEoln)){
00274             ChA+=Ch; Ch=SIn->GetCh();
00275           }
00276         } else {
00277           Fail;
00278         }
00279       }
00280       // add value to spreadsheet
00281       ChA.Trunc();
00282       FldValV.Add(ChA);
00283       // process delimiters
00284       if (SIn->Eof()){
00285         break;
00286       } else
00287       if ((SsFmt==ssfTabSep)&&(Ch=='\t')){
00288         X++; Ch=SIn->GetCh();
00289       } else
00290       if ((SsFmt==ssfCommaSep)&&(Ch==',')){
00291         X++; Ch=SIn->GetCh();
00292       } else
00293       if ((SsFmt==ssfSemicolonSep)&&(Ch==';')){
00294         X++; Ch=SIn->GetCh();
00295       } else
00296       if ((SsFmt==ssfVBar)&&(Ch=='|')){
00297         X++; Ch=SIn->GetCh();
00298       } else
00299       if (Ch=='\r'){
00300         Ch=SIn->GetCh();
00301         if ((Ch=='\n')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00302         break;
00303       } else
00304       if (Ch=='\n'){
00305         X=0; Ch=SIn->GetCh();
00306         if ((Ch=='\r')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00307         break;
00308       } else {
00309         Fail;
00310       }
00311     }
00312   }
00313 }
00314 
00315 TSsFmt TSs::GetSsFmtFromStr(const TStr& SsFmtNm){
00316   TStr LcSsFmtNm=SsFmtNm.GetLc();
00317   if (LcSsFmtNm=="tab"){return ssfTabSep;}
00318   else if (LcSsFmtNm=="comma"){return ssfCommaSep;}
00319   else if (LcSsFmtNm=="semicolon"){return ssfSemicolonSep;}
00320   else if (LcSsFmtNm=="vbar"){return ssfVBar;}
00321   else if (LcSsFmtNm=="space"){return ssfSpaceSep;}
00322   else if (LcSsFmtNm=="white"){return ssfWhiteSep;}
00323   else {return ssfUndef;}
00324 }
00325 
00326 TStr TSs::GetStrFromSsFmt(const TSsFmt& SsFmt){
00327   switch (SsFmt){
00328     case ssfTabSep: return "tab";
00329     case ssfCommaSep: return "comma";
00330     case ssfSemicolonSep: return "semicolon";
00331     case ssfVBar: return "vbar";
00332     case ssfSpaceSep: return "space";
00333     case ssfWhiteSep: return "white";
00334     default: return "undef";
00335   }
00336 }
00337 
00338 TStr TSs::GetSsFmtNmVStr(){
00339   TChA ChA;
00340   ChA+='(';
00341   ChA+="tab"; ChA+=", ";
00342   ChA+="comma"; ChA+=", ";
00343   ChA+="semicolon"; ChA+=", ";
00344   ChA+="space"; ChA+=", ";
00345   ChA+="white"; ChA+=")";
00346   return ChA;
00347 }
00348 
00350 // Fast-Spread-Sheet-Parser
00351 TSsParser::TSsParser(const TStr& FNm, const TSsFmt _SsFmt, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(_SsFmt), 
00352  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), FldV(), FInPt(NULL) {
00353   if (TZipIn::IsZipExt(FNm.GetFExt())) { FInPt = TZipIn::New(FNm); }
00354   else { FInPt = TFIn::New(FNm); }
00355   //Bf = new char [BfLen];
00356   switch(SsFmt) {
00357     case ssfTabSep : SplitCh = '\t'; break;
00358     case ssfCommaSep : SplitCh = ','; break;
00359     case ssfSemicolonSep : SplitCh = ';'; break;
00360     case ssfVBar : SplitCh = '|'; break;
00361     case ssfSpaceSep : SplitCh = ' '; break;
00362     case ssfWhiteSep: SplitCh = ' '; break;
00363     default: FailR("Unknown separator character.");
00364   }
00365 }
00366 
00367 TSsParser::TSsParser(const TStr& FNm, const char& Separator, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(ssfSpaceSep), 
00368  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), FldV(), FInPt(NULL) {
00369   if (TZipIn::IsZipExt(FNm.GetFExt())) { FInPt = TZipIn::New(FNm); }
00370   else { FInPt = TFIn::New(FNm); }
00371   SplitCh = Separator;
00372 }
00373 
00374 TSsParser::~TSsParser() {
00375   //if (Bf != NULL) { delete [] Bf; }
00376 }
00377 
00378 bool TSsParser::Next() { // split on SplitCh
00379   FldV.Clr(false);
00380   LineStr.Clr();
00381   FldV.Clr();
00382   LineCnt++;
00383   if (! FInPt->GetNextLn(LineStr)) { return false; }
00384   if (SkipCmt && LineStr.Len()>0 && LineStr[0]=='#') { return Next(); }
00385 
00386   char* cur = LineStr.CStr();
00387   if (SkipLeadBlanks) { // skip leadning blanks
00388     while (*cur && TCh::IsWs(*cur)) { cur++; }
00389   }
00390   char *last = cur;
00391   while (*cur) {
00392     if (SsFmt == ssfWhiteSep) { while (*cur && ! TCh::IsWs(*cur)) { cur++; } } 
00393     else { while (*cur && *cur!=SplitCh) { cur++; } }
00394     if (*cur == 0) { break; }
00395     *cur = 0;  cur++;
00396     FldV.Add(last);  last = cur;
00397     if (SkipEmptyFld && strlen(FldV.Last())==0) { FldV.DelLast(); } // skip empty fields
00398   }
00399   FldV.Add(last);  // add last field
00400   if (SkipEmptyFld && FldV.Empty()) { return Next(); } // skip empty lines
00401   return true; 
00402 }
00403 
00404 void TSsParser::ToLc() {
00405   for (int f = 0; f < FldV.Len(); f++) {
00406     for (char *c = FldV[f]; *c; c++) {
00407       *c = tolower(*c); }
00408   }
00409 }
00410 
00411 bool TSsParser::GetInt(const int& FldN, int& Val) const {
00412   // parsing format {ws} [+/-] +{ddd}
00413   int _Val = -1;
00414   bool Minus=false;
00415   const char *c = GetFld(FldN);
00416   while (TCh::IsWs(*c)) { c++; }
00417   if (*c=='-') { Minus=true; c++; }
00418   if (! TCh::IsNum(*c)) { return false; }
00419   _Val = TCh::GetNum(*c);  c++;
00420   while (TCh::IsNum(*c)){ 
00421     _Val = 10 * _Val + TCh::GetNum(*c); 
00422     c++; 
00423   }
00424   if (Minus) { _Val = -_Val; }
00425   if (*c != 0) { return false; }
00426   Val = _Val;
00427   return true;
00428 }
00429 
00430 bool TSsParser::GetFlt(const int& FldN, double& Val) const {
00431   // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
00432   const char *c = GetFld(FldN);
00433   while (TCh::IsWs(*c)) { c++; }
00434   if (*c=='+' || *c=='-') { c++; }
00435   if (! TCh::IsNum(*c) && *c!='.') { return false; }
00436   while (TCh::IsNum(*c)) { c++; }
00437   if (*c == '.') {
00438     c++;
00439     while (TCh::IsNum(*c)) { c++; }
00440   }
00441   if (*c=='e' || *c == 'E') {
00442     c++;
00443     if (*c == '+' || *c == '-' ) { c++; }
00444     if (! TCh::IsNum(*c)) { return false; }
00445     while (TCh::IsNum(*c)) { c++; }
00446   }
00447   if (*c != 0) { return false; }
00448   Val = atof(GetFld(FldN));
00449   return true;
00450 }
00451 
00452 const char* TSsParser::DumpStr() const {
00453   static TChA ChA(10*1024);
00454   ChA.Clr();
00455   for (int i = 0; i < FldV.Len(); i++) {
00456     ChA += TStr::Fmt("  %d: '%s'\n", i, FldV[i]);
00457   }
00458   return ChA.CStr();
00459 }
00460