SNAP Library, User Reference  2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TWebPg Class Reference

#include <html.h>

List of all members.

Public Member Functions

 TWebPg ()
 TWebPg (const TStrV &_UrlStrV, const TStrV &_IpNumV, const PHttpResp &_HttpResp)
 ~TWebPg ()
 TWebPg (TSIn &)
void Save (TSOut &)
TWebPg & operator= (const TWebPg &)
int GetUrls () const
TStr GetUrlStr (const int &UrlN=-1) const
PUrl GetUrl (const int &UrlN=-1) const
int GetIps () const
TStr GetIpNum (const int &IpN=-1) const
PHttpResp GetHttpResp () const
TStr GetHttpHdStr () const
TStr GetHttpBodyAsStr () const
void GetOutUrlV (TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const
void GetOutUrlV (TUrlV &OutUrlV) const
void GetOutDescUrlStrKdV (TStrKdV &OutDescUrlStrKdV) const
void PutFetchMSecs (const uint64 &_FetchMSecs)
uint64 GetFetchMSecs () const
void SaveAsHttpBody (const TStr &FNm) const
void SaveAsHttp (const TStr &FNm) const
bool IsTxt () const

Static Public Member Functions

static PWebPg New (const TStrV &UrlStrV, const TStrV &IpNumV, const PHttpResp &HttpResp)
static PWebPg New (const TStrV &UrlStrV, const PHttpResp &HttpResp)
static PWebPg New (const TStr &UrlStr, const PHttpResp &HttpResp)
static PWebPg Load (TSIn &)

Private Attributes

TCRef CRef
TStrV UrlStrV
TStrV IpNumV
PHttpResp HttpResp
uint64 FetchMSecs

Friends

class TPt< TWebPg >

Detailed Description

Definition at line 330 of file html.h.


Constructor & Destructor Documentation

TWebPg::TWebPg ( ) [inline]

Definition at line 337 of file html.h.

// Default constructor: empty URL list, empty IP-number list and a
// default-constructed response handle.
// FetchMSecs is zeroed explicitly so that GetFetchMSecs() cannot return an
// indeterminate value when PutFetchMSecs() has not been called yet
// (assumes uint64 is a plain integer type - TODO confirm).
: UrlStrV(), IpNumV(), HttpResp(), FetchMSecs(0){}
TWebPg::TWebPg ( const TStrV &  _UrlStrV,
const TStrV &  _IpNumV,
const PHttpResp &  _HttpResp 
) [inline]

Definition at line 338 of file html.h.

                                                                                 :
    // Copies the given URL list, IP-number list and response handle into the
    // members. The fetch time is not supplied by the caller, so FetchMSecs
    // starts at zero instead of staying uninitialized until PutFetchMSecs()
    // is called.
    UrlStrV(_UrlStrV), IpNumV(_IpNumV), HttpResp(_HttpResp), FetchMSecs(0){}
TWebPg::~TWebPg ( ) [inline]

Definition at line 347 of file html.h.

{
  // Empty body: no explicit cleanup is performed here.
}
TWebPg::TWebPg ( TSIn & ) [inline]

Definition at line 348 of file html.h.

{
  // Construction from an input stream is not implemented: the body only
  // invokes Fail.
  Fail;
}

Member Function Documentation

uint64 TWebPg::GetFetchMSecs ( ) const [inline]

Definition at line 377 of file html.h.

{
  // Returns the stored FetchMSecs value (the one set through PutFetchMSecs();
  // presumably a fetch duration in milliseconds).
  return FetchMSecs;
}
TStr TWebPg::GetHttpBodyAsStr ( ) const [inline]

Definition at line 368 of file html.h.

{
  // Returns the body of the stored HTTP response as a string.
  PHttpResp Resp=GetHttpResp();
  return Resp->GetBodyAsStr();
}
TStr TWebPg::GetHttpHdStr ( ) const [inline]

Definition at line 367 of file html.h.

{
  // Returns what the stored HTTP response reports through GetHdStr()
  // (presumably the header part of the response).
  PHttpResp Resp=GetHttpResp();
  return Resp->GetHdStr();
}
PHttpResp TWebPg::GetHttpResp ( ) const [inline]

Definition at line 366 of file html.h.

{
  // Hands out a copy of the stored response handle.
  return HttpResp;
}
TStr TWebPg::GetIpNum ( const int &  IpN = -1) const [inline]

Definition at line 363 of file html.h.

                                         {
    // An explicit index selects that entry of IpNumV; the default IpN==-1
    // selects the last entry.
    if (IpN!=-1){return IpNumV[IpN];}
    return IpNumV.Last();
}
int TWebPg::GetIps ( ) const [inline]

Definition at line 362 of file html.h.

{
  // Number of IP-number entries stored for this page.
  return IpNumV.Len();
}
void TWebPg::GetOutDescUrlStrKdV ( TStrKdV &  OutDescUrlStrKdV) const

Definition at line 1258 of file html.cpp.

                                                                {
  // Fills OutDescUrlStrKdV with one (description, url string) pair per
  // THtmlTok::ATagNm begin-tag (presumably an anchor) in the page body that
  // carries a url accepted by TUrl::IsOk().
  OutDescUrlStrKdV.Clr();
  // the page's own url is passed to TUrl::New together with each link
  // (presumably as the base for resolving relative urls)
  TStr BaseUrlStr=GetUrlStr();
  TStr BodyStr=GetHttpBodyAsStr();
  // parse the body as html
  PSIn BodySIn=TStrIn::New(BodyStr);
  PHtmlDoc Doc=THtmlDoc::New(BodySIn);
  PHtmlTok CurTok; THtmlLxSym Sym; TStr Str;
  const int TokCnt=Doc->GetToks();
  int NextTokN=0;
  while (NextTokN<TokCnt){
    CurTok=Doc->GetTok(NextTokN, Sym, Str); NextTokN++;
    // only begin-tags named THtmlTok::ATagNm are of interest
    if (!((Sym==hsyBTag)&&(Str==THtmlTok::ATagNm))){continue;}
    // the tag must carry a url ...
    TStr RelUrlStr;
    if (!CurTok->IsUrlTok(RelUrlStr)){continue;}
    // ... and that url must be valid
    PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
    if (!Url->IsOk()){continue;}
    // description: string/number/symbol tokens up to the matching end-tag
    // (or the end of the document), joined by single spaces
    TChA DescChA;
    while (NextTokN<TokCnt){
      CurTok=Doc->GetTok(NextTokN, Sym, Str); NextTokN++;
      if ((Sym==hsyETag)&&(Str==THtmlTok::ATagNm)){break;}
      const bool IsDescSym=(Sym==hsyStr)||(Sym==hsyNum)||(Sym==hsySSym);
      if (!IsDescSym){continue;}
      if (!DescChA.Empty()){DescChA+=' ';}
      DescChA+=Str;
    }
    OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
  }
}
void TWebPg::GetOutUrlV ( TUrlV &  OutUrlV,
TUrlV &  OutRedirUrlV 
) const

Definition at line 1230 of file html.cpp.

                                                                 {
  // Collects the urls found in begin-tags of the page body that pass
  // IsOk(usHttp) (presumably: valid http urls) into OutUrlV. Urls whose tag
  // is flagged by IsRedirUrlTok() are additionally appended to OutRedirUrlV.
  // Both output vectors are cleared first.
  OutUrlV.Clr(); OutRedirUrlV.Clr();
  // take interesting web-page components
  TStr UrlStr=GetUrlStr();
  TStr HtmlStr=GetHttpBodyAsStr();
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
  // traverse html; the token count is read once, as in GetOutDescUrlStrKdV,
  // and the unused outer Tok (shadowed by the loop-local one) is gone
  const int Toks=HtmlDoc->GetToks();
  for (int TokN=0; TokN<Toks; TokN++){
    PHtmlTok Tok=HtmlDoc->GetTok(TokN);
    if (Tok->GetSym()==hsyBTag){
      TStr RelUrlStr;
      if (Tok->IsUrlTok(RelUrlStr)){
        // UrlStr is passed along with the link (presumably as the base url)
        PUrl Url=TUrl::New(RelUrlStr, UrlStr);
        if (Url->IsOk(usHttp)){
          OutUrlV.Add(Url);
          if (Tok->IsRedirUrlTok()){
            OutRedirUrlV.Add(Url);
          }
        }
      }
    }
  }
}
void TWebPg::GetOutUrlV ( TUrlV &  OutUrlV) const [inline]

Definition at line 371 of file html.h.

                                        {
    // Convenience overload: the redirect urls are collected into a local
    // vector and dropped when it goes out of scope.
    TUrlV UnusedRedirUrlV;
    GetOutUrlV(OutUrlV, UnusedRedirUrlV);
}
PUrl TWebPg::GetUrl ( const int &  UrlN = -1) const [inline]

Definition at line 357 of file html.h.

                                        {
    // Picks the url string exactly as GetUrlStr() does (UrlN==-1 selects the
    // last entry of UrlStrV) and wraps it in a TUrl object.
    TStr UrlStr=GetUrlStr(UrlN);
    return TUrl::New(UrlStr);
}
int TWebPg::GetUrls ( ) const [inline]

Definition at line 354 of file html.h.

{
  // Number of url strings stored for this page.
  return UrlStrV.Len();
}
TStr TWebPg::GetUrlStr ( const int &  UrlN = -1) const [inline]

Definition at line 355 of file html.h.

                                           {
    // An explicit index selects that entry of UrlStrV; the default UrlN==-1
    // selects the last entry.
    if (UrlN!=-1){return UrlStrV[UrlN];}
    return UrlStrV.Last();
}
bool TWebPg::IsTxt ( ) const

Definition at line 1310 of file html.cpp.

                         {
  // Heuristic text check on the response body.
  // The body is inspected only when HttpResp->IsContType() is false
  // (presumably: no content type given) or the content type matches
  // THttp::TextFldVal; every other response is reported as non-text.
  if ((!HttpResp->IsContType())||HttpResp->IsContType(THttp::TextFldVal)){
    TStr Str=HttpResp->GetBodyAsStr();
    // sample at most the first 100 characters of the body
    int StrLen=Str.Len(); int ChN=0; int PrintChs=0;
    while ((ChN<100)&&(ChN<StrLen)){
      char Ch=Str[ChN++];
      // printable ASCII plus tab, line feed and carriage return
      if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
        PrintChs++;}
    }
    // an empty body gives no evidence of text (same result as before)
    if (ChN==0){return false;}
    // ChN is exactly the number of sampled characters. The previous
    // denominator ChN+1 pushed every all-printable body shorter than 10
    // characters to a ratio of at most 0.9, so it was rejected; results for
    // full 100-character samples are unchanged.
    double PrintPrb=double(PrintChs)/double(ChN);
    return PrintPrb>0.9;
  } else {
    return false;
  }
}
static PWebPg TWebPg::Load ( TSIn & ) [inline, static]

Definition at line 349 of file html.h.

{
  // Loading from a stream is not implemented: the body invokes Fail and
  // then returns a null handle.
  Fail;
  return NULL;
}
static PWebPg TWebPg::New ( const TStrV &  UrlStrV,
const TStrV &  IpNumV,
const PHttpResp &  HttpResp 
) [inline, static]

Definition at line 340 of file html.h.

                                                                                         {
    // Factory: allocates a page from the url list, IP-number list and
    // response, and returns it wrapped in a PWebPg handle.
    return PWebPg(new TWebPg(UrlStrV, IpNumV, HttpResp));
}
static PWebPg TWebPg::New ( const TStrV &  UrlStrV,
const PHttpResp &  HttpResp 
) [inline, static]

Definition at line 342 of file html.h.

                                                                    {
    // Factory without IP numbers: the page gets an empty IP-number list.
    TStrV EmptyIpNumV;
    return PWebPg(new TWebPg(UrlStrV, EmptyIpNumV, HttpResp));
}
static PWebPg TWebPg::New ( const TStr &  UrlStr,
const PHttpResp &  HttpResp 
) [inline, static]

Definition at line 344 of file html.h.

                                                                  {
    // Factory for a single url: wraps it in a one-element url list and
    // pairs it with an empty IP-number list.
    TStrV SingleUrlStrV;
    SingleUrlStrV.Add(UrlStr);
    TStrV EmptyIpNumV;
    return PWebPg(new TWebPg(SingleUrlStrV, EmptyIpNumV, HttpResp));
}
TWebPg& TWebPg::operator= ( const TWebPg & ) [inline]

Definition at line 352 of file html.h.

{
  // Assignment is not implemented: the body invokes Fail and copies
  // nothing; it then returns this object unchanged.
  Fail;
  return *this;
}
void TWebPg::PutFetchMSecs ( const uint64 &  _FetchMSecs) [inline]

Definition at line 376 of file html.h.

{
  // Stores the given value; GetFetchMSecs() reads it back.
  FetchMSecs=_FetchMSecs;
}
void TWebPg::Save ( TSOut & ) [inline]

Definition at line 350 of file html.h.

{
  // Saving to an output stream is not implemented: the body only invokes
  // Fail.
  Fail;
}
void TWebPg::SaveAsHttp ( const TStr &  FNm) const

Definition at line 1303 of file html.cpp.

                                             {
  // Writes the stored response to the file FNm through
  // THttpResp::SaveTxt (presumably the full response, headers included).
  PSOut FileSOut=TFOut::New(FNm);
  HttpResp->SaveTxt(FileSOut);
}
void TWebPg::SaveAsHttpBody ( const TStr &  FNm) const

Definition at line 1296 of file html.cpp.

                                                 {
  // Writes the stored response to the file FNm through
  // THttpResp::SaveBody (presumably the body only).
  PSOut FileSOut=TFOut::New(FNm);
  HttpResp->SaveBody(FileSOut);
}

Friends And Related Function Documentation

friend class TPt< TWebPg > [friend]

Definition at line 330 of file html.h.


Member Data Documentation

TCRef TWebPg::CRef [private]

Definition at line 330 of file html.h.

uint64 TWebPg::FetchMSecs [private]

Definition at line 335 of file html.h.

PHttpResp TWebPg::HttpResp [private]

Definition at line 334 of file html.h.

TStrV TWebPg::IpNumV [private]

Definition at line 333 of file html.h.

TStrV TWebPg::UrlStrV [private]

Definition at line 332 of file html.h.


The documentation for this class was generated from the following files: