SNAP Library 2.1, Developer Reference
2013-09-25 10:47:25
SNAP, a general-purpose, high-performance system for analysis and manipulation of large networks
|
#include <html.h>
Public Member Functions | |
TWebPg () | |
TWebPg (const TStrV &_UrlStrV, const TStrV &_IpNumV, const PHttpResp &_HttpResp) | |
~TWebPg () | |
TWebPg (TSIn &) | |
void | Save (TSOut &) |
TWebPg & | operator= (const TWebPg &) |
int | GetUrls () const |
TStr | GetUrlStr (const int &UrlN=-1) const |
PUrl | GetUrl (const int &UrlN=-1) const |
int | GetIps () const |
TStr | GetIpNum (const int &IpN=-1) const |
PHttpResp | GetHttpResp () const |
TStr | GetHttpHdStr () const |
TStr | GetHttpBodyAsStr () const |
void | GetOutUrlV (TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const |
void | GetOutUrlV (TUrlV &OutUrlV) const |
void | GetOutDescUrlStrKdV (TStrKdV &OutDescUrlStrKdV) const |
void | PutFetchMSecs (const uint64 &_FetchMSecs) |
uint64 | GetFetchMSecs () const |
void | SaveAsHttpBody (const TStr &FNm) const |
void | SaveAsHttp (const TStr &FNm) const |
bool | IsTxt () const |
Static Public Member Functions | |
static PWebPg | New (const TStrV &UrlStrV, const TStrV &IpNumV, const PHttpResp &HttpResp) |
static PWebPg | New (const TStrV &UrlStrV, const PHttpResp &HttpResp) |
static PWebPg | New (const TStr &UrlStr, const PHttpResp &HttpResp) |
static PWebPg | Load (TSIn &) |
Private Attributes | |
TCRef | CRef |
TStrV | UrlStrV |
TStrV | IpNumV |
PHttpResp | HttpResp |
uint64 | FetchMSecs |
Friends | |
class | TPt< TWebPg > |
TWebPg::TWebPg | ( | ) | [inline] |
TWebPg::TWebPg | ( | const TStrV & | _UrlStrV, |
const TStrV & | _IpNumV, | ||
const PHttpResp & | _HttpResp | ||
) | [inline] |
TWebPg::~TWebPg | ( | ) | [inline] |
uint64 TWebPg::GetFetchMSecs | ( | ) | const [inline] |
Definition at line 377 of file html.h.
// Returns the value currently held in the FetchMSecs member
// (presumably the page fetch duration in milliseconds -- confirm against the caller of PutFetchMSecs).
{return FetchMSecs;}
TStr TWebPg::GetHttpBodyAsStr | ( | ) | const [inline] |
Definition at line 368 of file html.h.
Referenced by GetOutDescUrlStrKdV() and GetOutUrlV().
// Returns the HTTP response body as a string by delegating to GetHttpResp()->GetBodyAsStr().
{return GetHttpResp()->GetBodyAsStr();}
TStr TWebPg::GetHttpHdStr | ( | ) | const [inline] |
Definition at line 367 of file html.h.
// Returns the HTTP response header as a string by delegating to GetHttpResp()->GetHdStr().
{return GetHttpResp()->GetHdStr();}
PHttpResp TWebPg::GetHttpResp | ( | ) | const [inline] |
TStr TWebPg::GetIpNum | ( | const int & | IpN = -1 | ) | const [inline] |
int TWebPg::GetIps | ( | ) | const [inline] |
void TWebPg::GetOutDescUrlStrKdV | ( | TStrKdV & | OutDescUrlStrKdV | ) | const |
Definition at line 1258 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), TChA::Empty(), GetHttpBodyAsStr(), THtmlDoc::GetTok(), THtmlDoc::GetToks(), TUrl::GetUrlStr(), GetUrlStr(), hsyBTag, hsyETag, hsyNum, hsySSym, hsyStr, TUrl::IsOk(), and New().
{
  // Rebuilds OutDescUrlStrKdV from scratch: for every begin-tag named
  // THtmlTok::ATagNm that carries a valid url, one (description, url string)
  // pair is appended, where the description is the text found inside the tag.
  OutDescUrlStrKdV.Clr();
  // page url (passed to TUrl::New together with each extracted link) and page body
  TStr PgUrlStr=GetUrlStr();
  TStr BodyStr=GetHttpBodyAsStr();
  // tokenize the body
  PSIn BodySIn=TStrIn::New(BodyStr);
  PHtmlDoc Doc=THtmlDoc::New(BodySIn);
  // walk over all tokens
  PHtmlTok CurTok; THtmlLxSym CurSym; TStr CurStr;
  const int Toks=Doc->GetToks();
  int TokN=0;
  while (TokN<Toks){
    CurTok=Doc->GetTok(TokN, CurSym, CurStr); TokN++;
    // skip everything that is not an opening ATagNm tag
    if (!((CurSym==hsyBTag)&&(CurStr==THtmlTok::ATagNm))){continue;}
    // skip tags without a url
    TStr RelUrlStr;
    if (!CurTok->IsUrlTok(RelUrlStr)){continue;}
    // skip urls that do not validate
    PUrl Url=TUrl::New(RelUrlStr, PgUrlStr);
    if (!Url->IsOk()){continue;}
    // collect the description: string, number and special-symbol tokens up to
    // the matching end-tag (or the end of the document), joined by single spaces
    TChA DescChA;
    bool TagOpen=true;
    while (TagOpen&&(TokN<Toks)){
      CurTok=Doc->GetTok(TokN, CurSym, CurStr); TokN++;
      if ((CurSym==hsyETag)&&(CurStr==THtmlTok::ATagNm)){
        TagOpen=false;
      } else if ((CurSym==hsyStr)||(CurSym==hsyNum)||(CurSym==hsySSym)){
        if (!DescChA.Empty()){DescChA+=' ';}
        DescChA+=CurStr;
      }
    }
    OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
  }
}
void TWebPg::GetOutUrlV | ( | TUrlV & | OutUrlV, |
TUrlV & | OutRedirUrlV | ||
) | const |
Definition at line 1230 of file html.cpp.
References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), GetHttpBodyAsStr(), THtmlDoc::GetTok(), THtmlDoc::GetToks(), GetUrlStr(), hsyBTag, TUrl::IsOk(), New(), and usHttp.
{
  // Collects the outgoing links of the page body.
  //   OutUrlV      - receives every url taken from a begin-tag that passes IsOk(usHttp)
  //   OutRedirUrlV - receives the subset of those urls whose tag reports IsRedirUrlTok()
  // Both vectors are cleared first, so previous content is discarded.
  OutUrlV.Clr(); OutRedirUrlV.Clr();
  // take interesting web-page components
  TStr UrlStr=GetUrlStr();
  TStr HtmlStr=GetHttpBodyAsStr();
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
  // traverse html; the loop never modifies HtmlDoc, so the token count is read once
  const int Toks=HtmlDoc->GetToks();
  for (int TokN=0; TokN<Toks; TokN++){
    PHtmlTok Tok=HtmlDoc->GetTok(TokN);
    // only begin-tags can carry a link
    if (Tok->GetSym()!=hsyBTag){continue;}
    TStr RelUrlStr;
    if (!Tok->IsUrlTok(RelUrlStr)){continue;}
    // combine the extracted url with the page url and keep it only if it validates as http
    PUrl Url=TUrl::New(RelUrlStr, UrlStr);
    if (!Url->IsOk(usHttp)){continue;}
    OutUrlV.Add(Url);
    if (Tok->IsRedirUrlTok()){OutRedirUrlV.Add(Url);}
  }
}
void TWebPg::GetOutUrlV | ( | TUrlV & | OutUrlV | ) | const [inline] |
Definition at line 371 of file html.h.
{
  // Convenience overload: forwards to the two-vector GetOutUrlV and
  // drops the redirect urls, which are collected into a throw-away local.
  TUrlV DiscardedRedirUrlV;
  GetOutUrlV(OutUrlV, DiscardedRedirUrlV);
}
PUrl TWebPg::GetUrl | ( | const int & | UrlN = -1 | ) | const [inline] |
int TWebPg::GetUrls | ( | ) | const [inline] |
TStr TWebPg::GetUrlStr | ( | const int & | UrlN = -1 | ) | const [inline] |
Definition at line 355 of file html.h.
Referenced by GetOutDescUrlStrKdV() and GetOutUrlV().
bool TWebPg::IsTxt | ( | ) | const |
Definition at line 1310 of file html.cpp.
References TCh::CrCh, THttpResp::GetBodyAsStr(), HttpResp, THttpResp::IsContType(), TStr::Len(), TCh::LfCh, TCh::TabCh, and THttp::TextFldVal.
{
  // Heuristically decides whether the response body is text.
  // Returns false right away when HttpResp->IsContType() holds but
  // HttpResp->IsContType(THttp::TextFldVal) does not (presumably: a content
  // type is declared and it is not a text type -- confirm against THttpResp).
  // Otherwise at most the first 100 body characters are sampled and the body
  // counts as text when more than 90% of them are printable ASCII
  // (' '..'~'), tab, line-feed or carriage-return.
  if (HttpResp->IsContType()&&(!HttpResp->IsContType(THttp::TextFldVal))){
    return false;
  }
  TStr BodyStr=HttpResp->GetBodyAsStr();
  const int BodyLen=BodyStr.Len();
  const int SampleChs=(BodyLen<100) ? BodyLen : 100;
  // an empty body has nothing to sample; report non-text (also avoids 0/0)
  if (SampleChs==0){return false;}
  int PrintChs=0;
  for (int ChN=0; ChN<SampleChs; ChN++){
    char Ch=BodyStr[ChN];
    if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
      PrintChs++;}
  }
  // divide by the number of characters actually examined; dividing by one
  // more than that made every all-printable body shorter than 10 characters
  // score at most 0.9 and be rejected
  double PrintPrb=double(PrintChs)/double(SampleChs);
  return PrintPrb>0.9;
}
static PWebPg TWebPg::Load | ( | TSIn & | ) | [inline, static] |
static PWebPg TWebPg::New | ( | const TStrV & | UrlStrV, |
const TStrV & | IpNumV, | ||
const PHttpResp & | HttpResp | ||
) | [inline, static] |
Definition at line 340 of file html.h.
Referenced by GetOutDescUrlStrKdV(), GetOutUrlV(), SaveAsHttp(), and SaveAsHttpBody().
{
  // Factory: heap-allocates a TWebPg from the given url strings, ip numbers
  // and http response, and hands it back wrapped in a PWebPg.
  return PWebPg(new TWebPg(UrlStrV, IpNumV, HttpResp));
}
static PWebPg TWebPg::New | ( | const TStrV & | UrlStrV, |
const PHttpResp & | HttpResp | ||
) | [inline, static] |
static PWebPg TWebPg::New | ( | const TStr & | UrlStr, |
const PHttpResp & | HttpResp | ||
) | [inline, static] |
void TWebPg::PutFetchMSecs | ( | const uint64 & | _FetchMSecs | ) | [inline] |
Definition at line 376 of file html.h.
// Stores _FetchMSecs in the FetchMSecs member, replacing the previous value.
{FetchMSecs=_FetchMSecs;}
void TWebPg::Save | ( | TSOut & | ) | [inline] |
void TWebPg::SaveAsHttp | ( | const TStr & | FNm | ) | const |
Definition at line 1303 of file html.cpp.
References HttpResp, TFOut::New(), and THttpResp::SaveTxt().
{
  // Writes the http response to the file named FNm via THttpResp::SaveTxt.
  // open the destination file
  PSOut FOut=TFOut::New(FNm);
  // let the response write itself to it
  HttpResp->SaveTxt(FOut);
}
void TWebPg::SaveAsHttpBody | ( | const TStr & | FNm | ) | const |
Definition at line 1296 of file html.cpp.
References HttpResp, TFOut::New(), and THttpResp::SaveBody().
{
  // Writes the http response body to the file named FNm via THttpResp::SaveBody.
  // open the destination file
  PSOut FOut=TFOut::New(FNm);
  // let the response write its body to it
  HttpResp->SaveBody(FOut);
}
TCRef TWebPg::CRef [private] |
uint64 TWebPg::FetchMSecs [private] |
PHttpResp TWebPg::HttpResp [private] |
Definition at line 334 of file html.h.
Referenced by IsTxt(), SaveAsHttp(), and SaveAsHttpBody().
TStrV TWebPg::IpNumV [private] |
TStrV TWebPg::UrlStrV [private] |