SNAP Library 2.1, Developer Reference
2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 #include "stdafx.h" 00003 #include "agmfast.h" 00004 #include "Snap.h" 00005 #include "agm.h" 00006 00007 void TAGMFast::Save(TSOut& SOut) { 00008 G->Save(SOut); 00009 F.Save(SOut); 00010 NIDV.Save(SOut); 00011 RegCoef.Save(SOut); 00012 SumFV.Save(SOut); 00013 NodesOk.Save(SOut); 00014 MinVal.Save(SOut); 00015 MaxVal.Save(SOut); 00016 NegWgt.Save(SOut); 00017 NumComs.Save(SOut); 00018 HOVIDSV.Save(SOut); 00019 PNoCom.Save(SOut); 00020 } 00021 00022 void TAGMFast::Load(TSIn& SIn, const int& RndSeed) { 00023 G->Load(SIn); 00024 F.Load(SIn); 00025 NIDV.Load(SIn); 00026 RegCoef.Load(SIn); 00027 SumFV.Load(SIn); 00028 NodesOk.Load(SIn); 00029 MinVal.Load(SIn); 00030 MaxVal.Load(SIn); 00031 NegWgt.Load(SIn); 00032 NumComs.Load(SIn); 00033 HOVIDSV.Load(SIn); 00034 PNoCom.Load(SIn); 00035 Rnd.PutSeed(RndSeed); 00036 } 00037 00038 void TAGMFast::RandomInit(const int InitComs) { 00039 F.Gen(G->GetNodes()); 00040 SumFV.Gen(InitComs); 00041 NumComs = InitComs; 00042 for (int u = 0; u < F.Len(); u++) { 00043 //assign to just one community 00044 int Mem = G->GetNI(u).GetDeg(); 00045 if (Mem > 10) { Mem = 10; } 00046 for (int c = 0; c < Mem; c++) { 00047 int CID = Rnd.GetUniDevInt(InitComs); 00048 AddCom(u, CID, Rnd.GetUniDev()); 00049 } 00050 } 00051 //assign a member to zero-member community (if any) 00052 for (int c = 0; c < SumFV.Len(); c++) { 00053 if (SumFV[c] == 0.0) { 00054 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00055 AddCom(UID, c, Rnd.GetUniDev()); 00056 } 00057 } 00058 } 00059 00060 void TAGMFast::NeighborComInit(const int InitComs) { 00061 //initialize with best neighborhood communities (Gleich et.al. KDD'12) 00062 F.Gen(G->GetNodes()); 00063 SumFV.Gen(InitComs); 00064 NumComs = InitComs; 00065 const int Edges = G->GetEdges(); 00066 //TIntFltH NCPhiH(F.Len()); 00067 TFltIntPrV NIdPhiV(F.Len(), 0); 00068 TIntSet InvalidNIDS(F.Len()); 00069 TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG 00070 TExeTm RunTm; 00071 //compute conductance of neighborhood community 00072 for (int u = 0; u < F.Len(); u++) { 00073 TIntSet NBCmty(G->GetNI(u).GetDeg() + 1); 00074 double Phi; 00075 if (G->GetNI(u).GetDeg() < 5) { //do not include nodes with too few degree 00076 Phi = 1.0; 00077 } else { 00078 TAGMUtil::GetNbhCom(G, u, NBCmty); 00079 IAssert(NBCmty.Len() == G->GetNI(u).GetDeg() + 1); 00080 Phi = TAGMUtil::GetConductance(G, NBCmty, Edges); 00081 } 00082 //NCPhiH.AddDat(u, Phi); 00083 NIdPhiV.Add(TFltIntPr(Phi, u)); 00084 } 00085 NIdPhiV.Sort(true); 00086 printf("conductance computation completed [%s]\n", RunTm.GetTmStr()); 00087 fflush(stdout); 00088 //choose nodes with local minimum in conductance 00089 int CurCID = 0; 00090 for (int ui = 0; ui < NIdPhiV.Len(); ui++) { 00091 int UID = NIdPhiV[ui].Val2; 00092 fflush(stdout); 00093 if (InvalidNIDS.IsKey(UID)) { continue; } 00094 ChosenNIDV.Add(UID); //FOR DEBUG 00095 //add the node and its neighbors to the current community 00096 AddCom(UID, CurCID, 1.0); 00097 TUNGraph::TNodeI NI = G->GetNI(UID); 00098 fflush(stdout); 00099 for (int e = 0; e < NI.GetDeg(); e++) { 00100 AddCom(NI.GetNbrNId(e), CurCID, 1.0); 00101 } 00102 //exclude its neighbors from the next considerations 00103 for (int e = 0; e < NI.GetDeg(); e++) { 00104 InvalidNIDS.AddKey(NI.GetNbrNId(e)); 00105 } 00106 CurCID++; 00107 fflush(stdout); 00108 if (CurCID >= NumComs) { break; } 00109 } 00110 if (NumComs > CurCID) { 00111 printf("%d communities needed to fill randomly\n", NumComs - CurCID); 00112 } 00113 //assign a member to zero-member community (if any) 00114 for (int c = 0; c < SumFV.Len(); c++) { 00115 if (SumFV[c] == 0.0) { 00116 int ComSz = 10; 00117 for (int u = 0; u < ComSz; u++) { 00118 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00119 AddCom(UID, c, Rnd.GetUniDev()); 00120 } 00121 } 00122 } 00123 } 00124 00125 void TAGMFast::SetCmtyVV(const TVec<TIntV>& CmtyVV) { 00126 F.Gen(G->GetNodes()); 00127 SumFV.Gen(CmtyVV.Len()); 00128 NumComs = CmtyVV.Len(); 00129 TIntH NIDIdxH(NIDV.Len()); 00130 if (! NodesOk) { 00131 for (int u = 0; u < NIDV.Len(); u++) { 00132 NIDIdxH.AddDat(NIDV[u], u); 00133 } 00134 } 00135 for (int c = 0; c < CmtyVV.Len(); c++) { 00136 for (int u = 0; u < CmtyVV[c].Len(); u++) { 00137 int UID = CmtyVV[c][u]; 00138 if (! NodesOk) { UID = NIDIdxH.GetDat(UID); } 00139 if (G->IsNode(UID)) { 00140 AddCom(UID, c, 1.0); 00141 } 00142 } 00143 } 00144 } 00145 00146 void TAGMFast::SetGraph(const PUNGraph& GraphPt) { 00147 G = GraphPt; 00148 HOVIDSV.Gen(G->GetNodes()); 00149 NodesOk = true; 00150 GraphPt->GetNIdV(NIDV); 00151 // check that nodes IDs are {0,1,..,Nodes-1} 00152 for (int nid = 0; nid < GraphPt->GetNodes(); nid++) { 00153 if (! GraphPt->IsNode(nid)) { 00154 NodesOk = false; 00155 break; 00156 } 00157 } 00158 if (! NodesOk) { 00159 printf("rearrage nodes\n"); 00160 G = TSnap::GetSubGraph(GraphPt, NIDV, true); 00161 for (int nid = 0; nid < G->GetNodes(); nid++) { 00162 IAssert(G->IsNode(nid)); 00163 } 00164 } 00165 TSnap::DelSelfEdges(G); 00166 00167 PNoCom = 1.0 / (double) G->GetNodes(); 00168 DoParallel = false; 00169 if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow 00170 NegWgt = 1.0; 00171 } 00172 00173 double TAGMFast::Likelihood(const bool _DoParallel) { 00174 TExeTm ExeTm; 00175 double L = 0.0; 00176 if (_DoParallel) { 00177 #pragma omp parallel for 00178 for (int u = 0; u < F.Len(); u++) { 00179 double LU = LikelihoodForRow(u); 00180 #pragma omp atomic 00181 L += LU; 00182 } 00183 } 00184 else { 00185 for (int u = 0; u < F.Len(); u++) { 00186 double LU = LikelihoodForRow(u); 00187 L += LU; 00188 } 00189 } 00190 return L; 00191 } 00192 00193 double TAGMFast::LikelihoodForRow(const int UID) { 00194 return LikelihoodForRow(UID, F[UID]); 00195 } 00196 00197 00198 double TAGMFast::LikelihoodForRow(const int UID, const TIntFltH& FU) { 00199 double L = 0.0; 00200 TFltV HOSumFV; //adjust for Fv of v hold out 00201 if (HOVIDSV[UID].Len() > 0) { 00202 HOSumFV.Gen(SumFV.Len()); 00203 00204 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00205 for (int c = 0; c < SumFV.Len(); c++) { 00206 HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); 00207 } 00208 } 00209 } 00210 00211 TUNGraph::TNodeI NI = G->GetNI(UID); 00212 if (DoParallel && NI.GetDeg() > 10) { 00213 #pragma omp parallel for schedule(static, 1) 00214 for (int e = 0; e < NI.GetDeg(); e++) { 00215 int v = NI.GetNbrNId(e); 00216 if (v == UID) { continue; } 00217 if (HOVIDSV[UID].IsKey(v)) { continue; } 00218 double LU = log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]); 00219 #pragma omp atomic 00220 L += LU; 00221 } 00222 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00223 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00224 double LU = NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat(); 00225 L -= LU; 00226 } 00227 } else { 00228 for (int e = 0; e < NI.GetDeg(); e++) { 00229 int v = NI.GetNbrNId(e); 00230 if (v == UID) { continue; } 00231 if (HOVIDSV[UID].IsKey(v)) { continue; } 00232 L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]); 00233 } 00234 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00235 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00236 L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat(); 00237 } 00238 } 00239 //add regularization 00240 if (RegCoef > 0.0) { //L1 00241 L -= RegCoef * Sum(FU); 00242 } 00243 if (RegCoef < 0.0) { //L2 00244 L += RegCoef * Norm2(FU); 00245 } 00246 00247 return L; 00248 } 00249 00250 void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { 00251 GradU.Gen(CIDSet.Len()); 00252 00253 TFltV HOSumFV; //adjust for Fv of v hold out 00254 if (HOVIDSV[UID].Len() > 0) { 00255 HOSumFV.Gen(SumFV.Len()); 00256 00257 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00258 for (int c = 0; c < SumFV.Len(); c++) { 00259 HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); 00260 } 00261 } 00262 } 00263 00264 TUNGraph::TNodeI NI = G->GetNI(UID); 00265 int Deg = NI.GetDeg(); 00266 TFltV PredV(Deg), GradV(CIDSet.Len()); 00267 TIntV CIDV(CIDSet.Len()); 00268 if (DoParallel && Deg + CIDSet.Len() > 10) { 00269 #pragma omp parallel for schedule(static, 1) 00270 for (int e = 0; e < Deg; e++) { 00271 if (NI.GetNbrNId(e) == UID) { continue; } 00272 if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } 00273 PredV[e] = Prediction(UID, NI.GetNbrNId(e)); 00274 } 00275 00276 #pragma omp parallel for schedule(static, 1) 00277 for (int c = 0; c < CIDSet.Len(); c++) { 00278 int CID = CIDSet.GetKey(c); 00279 double Val = 0.0; 00280 for (int e = 0; e < Deg; e++) { 00281 int VID = NI.GetNbrNId(e); 00282 if (VID == UID) { continue; } 00283 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00284 Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); 00285 } 00286 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00287 Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); 00288 CIDV[c] = CID; 00289 GradV[c] = Val; 00290 } 00291 } 00292 else { 00293 for (int e = 0; e < Deg; e++) { 00294 if (NI.GetNbrNId(e) == UID) { continue; } 00295 if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } 00296 PredV[e] = Prediction(UID, NI.GetNbrNId(e)); 00297 } 00298 00299 for (int c = 0; c < CIDSet.Len(); c++) { 00300 int CID = CIDSet.GetKey(c); 00301 double Val = 0.0; 00302 for (int e = 0; e < Deg; e++) { 00303 int VID = NI.GetNbrNId(e); 00304 if (VID == UID) { continue; } 00305 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00306 Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); 00307 } 00308 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00309 Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); 00310 CIDV[c] = CID; 00311 GradV[c] = Val; 00312 } 00313 } 00314 //add regularization 00315 if (RegCoef > 0.0) { //L1 00316 for (int c = 0; c < GradV.Len(); c++) { 00317 GradV[c] -= RegCoef; 00318 } 00319 } 00320 if (RegCoef < 0.0) { //L2 00321 for (int c = 0; c < GradV.Len(); c++) { 00322 GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 00323 } 00324 } 00325 00326 00327 for (int c = 0; c < GradV.Len(); c++) { 00328 if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } 00329 if (fabs(GradV[c]) < 0.0001) { continue; } 00330 GradU.AddDat(CIDV[c], GradV[c]); 00331 } 00332 for (int c = 0; c < GradU.Len(); c++) { 00333 if (GradU[c] >= 10) { GradU[c] = 10; } 00334 if (GradU[c] <= -10) { GradU[c] = -10; } 00335 IAssert(GradU[c] >= -10); 00336 } 00337 } 00338 00339 double TAGMFast::GradientForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) { 00340 TUNGraph::TNodeI UI = G->GetNI(UID); 00341 double Grad = 0.0, PNoEdge; 00342 int VID = 0; 00343 for (int e = 0; e < UI.GetDeg(); e++) { 00344 VID = UI.GetNbrNId(e); 00345 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00346 if (! F[VID].IsKey(CID)) { continue; } 00347 PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val); 00348 IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0); 00349 //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge; 00350 Grad += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID)); 00351 } 00352 Grad -= NegWgt * (SumFV[CID] - GetCom(UID, CID)); 00353 //add regularization 00354 if (RegCoef > 0.0) { //L1 00355 Grad -= RegCoef; 00356 } 00357 if (RegCoef < 0.0) { //L2 00358 Grad += 2 * RegCoef * Val; 00359 } 00360 00361 return Grad; 00362 } 00363 00364 double TAGMFast::LikelihoodForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) { 00365 TUNGraph::TNodeI UI = G->GetNI(UID); 00366 double L = 0.0, PNoEdge; 00367 int VID = 0; 00368 for (int e = 0; e < UI.GetDeg(); e++) { 00369 VID = UI.GetNbrNId(e); 00370 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00371 if (! F[VID].IsKey(CID)) { 00372 PNoEdge = AlphaKV[e]; 00373 } else { 00374 PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val); 00375 } 00376 IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0); 00377 //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge; 00378 L += log(1.0 - PNoEdge) + NegWgt * GetCom(VID, CID) * Val; 00379 00380 // += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID)); 00381 } 00382 L -= NegWgt * (SumFV[CID] - GetCom(UID, CID)) * Val; 00383 //add regularization 00384 if (RegCoef > 0.0) { //L1 00385 L -= RegCoef * Val; 00386 } 00387 if (RegCoef < 0.0) { //L2 00388 L += RegCoef * Val * Val; 00389 } 00390 00391 return L; 00392 } 00393 00394 double TAGMFast::HessianForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) { 00395 TUNGraph::TNodeI UI = G->GetNI(UID); 00396 double H = 0.0, PNoEdge; 00397 int VID = 0; 00398 for (int e = 0; e < UI.GetDeg(); e++) { 00399 VID = UI.GetNbrNId(e); 00400 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00401 if (! F[VID].IsKey(CID)) { continue; } 00402 PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val); 00403 IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0); 00404 //PNoEdge = PNoEdge == 1.0? 1 - PNoCom: PNoEdge; 00405 H += (- PNoEdge * F[VID].GetDat(CID) * F[VID].GetDat(CID)) / (1.0 - PNoEdge) / (1.0 - PNoEdge); 00406 } 00407 //add regularization 00408 if (RegCoef < 0.0) { //L2 00409 H += 2 * RegCoef; 00410 } 00411 IAssert (H <= 0.0); 00412 return H; 00413 } 00414 00416 int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) { 00417 TExeTm ExeTm; 00418 int iter = 0, PrevIter = 0; 00419 TIntFltPrV IterLV; 00420 double PrevL = TFlt::Mn, CurL; 00421 TUNGraph::TNodeI UI; 00422 TIntV NIdxV; 00423 G->GetNIdV(NIdxV); 00424 int CID, UID, NewtonIter; 00425 double Fuc; 00426 //double PrevFuc; 00427 double Grad, H; 00428 while(iter < MaxIter) { 00429 NIdxV.Shuffle(Rnd); 00430 for (int ui = 0; ui < F.Len(); ui++, iter++) { 00431 if (! PlotNm.Empty() && iter % G->GetNodes() == 0) { 00432 IterLV.Add(TIntFltPr(iter, Likelihood(false))); 00433 } 00434 UID = NIdxV[ui]; 00435 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00436 TIntSet CIDSet; 00437 UI = G->GetNI(UID); 00438 if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip 00439 if (! F[UID].Empty()) { F[UID].Clr(); } 00440 continue; 00441 } 00442 for (int e = 0; e < UI.GetDeg(); e++) { 00443 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00444 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00445 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00446 CIDSet.AddKey(CI.GetKey()); 00447 } 00448 } 00449 for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00450 if (! CIDSet.IsKey(CI.GetKey())) { 00451 DelCom(UID, CI.GetKey()); 00452 } 00453 } 00454 if (CIDSet.Empty()) { continue; } 00455 for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) { 00456 CID = CI.GetKey(); 00457 //optimize for UID, CID 00458 //compute constants 00459 TFltV AlphaKV(UI.GetDeg()); 00460 for (int e = 0; e < UI.GetDeg(); e++) { 00461 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00462 AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID)); 00463 IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID))); 00464 } 00465 Fuc = GetCom(UID, CID); 00466 //PrevFuc = Fuc; 00467 Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; 00468 if (Grad <= 1e-3 && Grad >= -0.1) { continue; } 00469 NewtonIter = 0; 00470 while (NewtonIter++ < 10) { 00471 Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; 00472 H = HessianForOneVar(AlphaKV, UID, CID, Fuc); 00473 if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; } 00474 if (fabs(Grad) < 1e-3) { break; } 00475 if (H == 0.0) { Fuc = 0.0; break; } 00476 double NewtonStep = - Grad / H; 00477 if (NewtonStep < -0.5) { NewtonStep = - 0.5; } 00478 Fuc += NewtonStep; 00479 if (Fuc < 0.0) { Fuc = 0.0; } 00480 } 00481 if (Fuc == 0.0) { 00482 DelCom(UID, CID); 00483 } 00484 else { 00485 AddCom(UID, CID, Fuc); 00486 } 00487 } 00488 } 00489 if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { 00490 PrevIter = iter; 00491 CurL = Likelihood(); 00492 if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { 00493 printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); 00494 } 00495 fflush(stdout); 00496 if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } 00497 else { PrevL = CurL; } 00498 } 00499 00500 } 00501 if (! PlotNm.Empty()) { 00502 printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); 00503 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00504 } 00505 return iter; 00506 } 00507 00508 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV) { 00509 GetCmtyVV(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3); 00510 } 00511 00513 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) { 00514 CmtyVV.Gen(NumComs, 0); 00515 TIntFltH CIDSumFH(NumComs); 00516 for (int c = 0; c < SumFV.Len(); c++) { 00517 CIDSumFH.AddDat(c, SumFV[c]); 00518 } 00519 CIDSumFH.SortByDat(false); 00520 for (int c = 0; c < NumComs; c++) { 00521 int CID = CIDSumFH.GetKey(c); 00522 TIntFltH NIDFucH(F.Len() / 10); 00523 TIntV CmtyV; 00524 IAssert(SumFV[CID] == CIDSumFH.GetDat(CID)); 00525 if (SumFV[CID] < Thres) { continue; } 00526 for (int u = 0; u < F.Len(); u++) { 00527 int NID = u; 00528 if (! NodesOk) { NID = NIDV[u]; } 00529 if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); } 00530 } 00531 NIDFucH.SortByDat(false); 00532 NIDFucH.GetKeyV(CmtyV); 00533 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00534 } 00535 if ( NumComs != CmtyVV.Len()) { 00536 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00537 } 00538 } 00539 00541 int TAGMFast::FindComsByCV(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const double StepAlpha, const double StepBeta) { 00542 double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs); 00543 TIntV ComsV; 00544 ComsV.Add(MinComs); 00545 while (ComsV.Len() < DivComs) { 00546 int NewComs = int(ComsV.Last() * ComsGap); 00547 if (NewComs == ComsV.Last().Val) { NewComs++; } 00548 ComsV.Add(NewComs); 00549 } 00550 if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); } 00551 return FindComsByCV(ComsV, 0.1, NumThreads, OutFNm + ".CV.likelihood", StepAlpha, StepBeta); 00552 } 00553 00554 int TAGMFast::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) { 00555 if (ComsV.Len() == 0) { 00556 int MaxComs = G->GetNodes() / 5; 00557 ComsV.Add(2); 00558 while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); } 00559 } 00560 TIntPrV EdgeV(G->GetEdges(), 0); 00561 for (TUNGraph::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) { 00562 EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId())); 00563 } 00564 EdgeV.Shuffle(Rnd); 00565 int MaxIterCV = 3; 00566 00567 TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV); 00568 if (EdgeV.Len() > 50) { //if edges are many enough, use CV 00569 printf("generating hold out set\n"); 00570 TIntV NIdV1, NIdV2; 00571 G->GetNIdV(NIdV1); 00572 G->GetNIdV(NIdV2); 00573 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00574 // generate holdout sets 00575 HoldOutSets[IterCV].Gen(G->GetNodes()); 00576 const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0); 00577 int HOCnt = 0; 00578 int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges()); 00579 printf("holding out %d edges...\n", HOEdges); 00580 for (int he = 0; he < (int) HOEdges; he++) { 00581 HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2); 00582 HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1); 00583 HOCnt++; 00584 } 00585 printf("%d Edges hold out\n", HOCnt); 00586 while(HOCnt++ < HOTotal) { 00587 int SrcNID = Rnd.GetUniDevInt(G->GetNodes()); 00588 int DstNID = Rnd.GetUniDevInt(G->GetNodes()); 00589 HoldOutSets[IterCV][SrcNID].AddKey(DstNID); 00590 HoldOutSets[IterCV][DstNID].AddKey(SrcNID); 00591 } 00592 } 00593 printf("hold out set generated\n"); 00594 } 00595 00596 TFltV HOLV(ComsV.Len()); 00597 TIntFltPrV ComsLV; 00598 for (int c = 0; c < ComsV.Len(); c++) { 00599 const int Coms = ComsV[c]; 00600 printf("Try number of Coms:%d\n", Coms); 00601 NeighborComInit(Coms); 00602 printf("Initialized\n"); 00603 00604 if (EdgeV.Len() > 50) { //if edges are many enough, use CV 00605 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00606 HOVIDSV = HoldOutSets[IterCV]; 00607 00608 if (NumThreads == 1) { 00609 printf("MLE without parallelization begins\n"); 00610 MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta); 00611 } else { 00612 printf("MLE with parallelization begins\n"); 00613 MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta); 00614 } 00615 double HOL = LikelihoodHoldOut(); 00616 HOL = HOL < 0? HOL: TFlt::Mn; 00617 HOLV[c] += HOL; 00618 } 00619 } 00620 else { 00621 HOVIDSV.Gen(G->GetNodes()); 00622 MLEGradAscent(0.0001, 100 * G->GetNodes(), ""); 00623 double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes()); 00624 HOLV[c] = BIC; 00625 } 00626 } 00627 int EstComs = 2; 00628 double MaxL = TFlt::Mn; 00629 printf("\n"); 00630 for (int c = 0; c < ComsV.Len(); c++) { 00631 ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val)); 00632 printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val); 00633 if (MaxL < HOLV[c]) { 00634 MaxL = HOLV[c]; 00635 EstComs = ComsV[c]; 00636 } 00637 } 00638 printf("\n"); 00639 RandomInit(EstComs); 00640 HOVIDSV.Gen(G->GetNodes()); 00641 if (! PlotLFNm.Empty()) { 00642 TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood"); 00643 } 00644 return EstComs; 00645 } 00646 00647 double TAGMFast::LikelihoodHoldOut(const bool DoParallel) { 00648 double L = 0.0; 00649 for (int u = 0; u < HOVIDSV.Len(); u++) { 00650 for (int e = 0; e < HOVIDSV[u].Len(); e++) { 00651 int VID = HOVIDSV[u][e]; 00652 if (VID == u) { continue; } 00653 double Pred = Prediction(u, VID); 00654 if (G->IsEdge(u, VID)) { 00655 L += log(1.0 - Pred); 00656 } 00657 else { 00658 L += NegWgt * log(Pred); 00659 } 00660 } 00661 } 00662 return L; 00663 } 00664 00665 double TAGMFast::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) { 00666 double StepSize = 1.0; 00667 double InitLikelihood = LikelihoodForRow(UID); 00668 TIntFltH NewVarV(DeltaV.Len()); 00669 for(int iter = 0; iter < MaxIter; iter++) { 00670 for (int i = 0; i < DeltaV.Len(); i++){ 00671 int CID = DeltaV.GetKey(i); 00672 double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID); 00673 if (NewVal < MinVal) { NewVal = MinVal; } 00674 if (NewVal > MaxVal) { NewVal = MaxVal; } 00675 NewVarV.AddDat(CID, NewVal); 00676 } 00677 if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) { 00678 StepSize *= Beta; 00679 } else { 00680 break; 00681 } 00682 if (iter == MaxIter - 1) { 00683 StepSize = 0.0; 00684 break; 00685 } 00686 } 00687 return StepSize; 00688 } 00689 00690 int TAGMFast::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00691 time_t InitTime = time(NULL); 00692 TExeTm ExeTm, CheckTm; 00693 int iter = 0, PrevIter = 0; 00694 TIntFltPrV IterLV; 00695 TUNGraph::TNodeI UI; 00696 double PrevL = TFlt::Mn, CurL = 0.0; 00697 TIntV NIdxV(F.Len(), 0); 00698 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00699 IAssert(NIdxV.Len() == F.Len()); 00700 TIntFltH GradV; 00701 while(iter < MaxIter) { 00702 NIdxV.Shuffle(Rnd); 00703 for (int ui = 0; ui < F.Len(); ui++, iter++) { 00704 int u = NIdxV[ui]; // 00705 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00706 UI = G->GetNI(u); 00707 TIntSet CIDSet(5 * UI.GetDeg()); 00708 for (int e = 0; e < UI.GetDeg(); e++) { 00709 if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } 00710 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00711 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00712 CIDSet.AddKey(CI.GetKey()); 00713 } 00714 } 00715 for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00716 if (! CIDSet.IsKey(CI.GetKey())) { 00717 DelCom(u, CI.GetKey()); 00718 } 00719 } 00720 if (CIDSet.Empty()) { continue; } 00721 GradientForRow(u, GradV, CIDSet); 00722 if (Norm2(GradV) < 1e-4) { continue; } 00723 double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta); 00724 if (LearnRate == 0.0) { continue; } 00725 for (int ci = 0; ci < GradV.Len(); ci++) { 00726 int CID = GradV.GetKey(ci); 00727 double Change = LearnRate * GradV.GetDat(CID); 00728 double NewFuc = GetCom(u, CID) + Change; 00729 if (NewFuc <= 0.0) { 00730 DelCom(u, CID); 00731 } else { 00732 AddCom(u, CID, NewFuc); 00733 } 00734 } 00735 if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) { 00736 IterLV.Add(TIntFltPr(iter, Likelihood(false))); 00737 } 00738 } 00739 printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime); 00740 fflush(stdout); 00741 if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { 00742 PrevIter = iter; 00743 CurL = Likelihood(); 00744 if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { 00745 printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); 00746 } 00747 fflush(stdout); 00748 if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } 00749 else { PrevL = CurL; } 00750 } 00751 00752 } 00753 printf("\n"); 00754 printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); 00755 if (! PlotNm.Empty()) { 00756 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00757 } 00758 return iter; 00759 } 00760 00761 int TAGMFast::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00762 //parallel 00763 time_t InitTime = time(NULL); 00764 uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs(); 00765 TExeTm ExeTm, CheckTm; 00766 double PrevL = Likelihood(true); 00767 TIntFltPrV IterLV; 00768 int PrevIter = 0; 00769 int iter = 0; 00770 TIntV NIdxV(F.Len(), 0); 00771 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00772 TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization 00773 NIDOPTV.PutAll(0); 00774 TVec<TIntFltH> NewF(ChunkNum * ChunkSize); 00775 TIntV NewNIDV(ChunkNum * ChunkSize); 00776 for (iter = 0; iter < MaxIter; iter++) { 00777 NIdxV.Clr(false); 00778 for (int i = 0; i < F.Len(); i++) { 00779 if (NIDOPTV[i] == 0) { NIdxV.Add(i); } 00780 } 00781 IAssert (NIdxV.Len() <= F.Len()); 00782 NIdxV.Shuffle(Rnd); 00783 // compute gradient for chunk of nodes 00784 #pragma omp parallel for schedule(static, 1) 00785 for (int TIdx = 0; TIdx < ChunkNum; TIdx++) { 00786 TIntFltH GradV; 00787 for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) { 00788 NewNIDV[ui] = -1; 00789 if (ui > NIdxV.Len()) { continue; } 00790 int u = NIdxV[ui]; // 00791 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00792 TUNGraph::TNodeI UI = G->GetNI(u); 00793 TIntSet CIDSet(5 * UI.GetDeg()); 00794 TIntFltH CurFU = F[u]; 00795 for (int e = 0; e < UI.GetDeg(); e++) { 00796 if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } 00797 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00798 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00799 CIDSet.AddKey(CI.GetKey()); 00800 } 00801 } 00802 if (CIDSet.Empty()) { 00803 CurFU.Clr(); 00804 } 00805 else { 00806 for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00807 if (! CIDSet.IsKey(CI.GetKey())) { 00808 CurFU.DelIfKey(CI.GetKey()); 00809 } 00810 } 00811 GradientForRow(u, GradV, CIDSet); 00812 if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; } 00813 double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta, 5); 00814 if (LearnRate <= 1e-5) { NewNIDV[ui] = -2; continue; } 00815 for (int ci = 0; ci < GradV.Len(); ci++) { 00816 int CID = GradV.GetKey(ci); 00817 double Change = LearnRate * GradV.GetDat(CID); 00818 double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change; 00819 if (NewFuc <= 0.0) { 00820 CurFU.DelIfKey(CID); 00821 } else { 00822 CurFU.AddDat(CID) = NewFuc; 00823 } 00824 } 00825 CurFU.Defrag(); 00826 } 00827 //store changes 00828 NewF[ui] = CurFU; 00829 NewNIDV[ui] = u; 00830 } 00831 } 00832 int NumNoChangeGrad = 0; 00833 int NumNoChangeStepSize = 0; 00834 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00835 int NewNID = NewNIDV[ui]; 00836 if (NewNID == -1) { NumNoChangeGrad++; continue; } 00837 if (NewNID == -2) { NumNoChangeStepSize++; continue; } 00838 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 00839 SumFV[CI.GetKey()] -= CI.GetDat(); 00840 } 00841 } 00842 #pragma omp parallel for 00843 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00844 int NewNID = NewNIDV[ui]; 00845 if (NewNID < 0) { continue; } 00846 F[NewNID] = NewF[ui]; 00847 } 00848 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00849 int NewNID = NewNIDV[ui]; 00850 if (NewNID < 0) { continue; } 00851 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 00852 SumFV[CI.GetKey()] += CI.GetDat(); 00853 } 00854 } 00855 // update the nodes who are optimal 00856 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00857 int NewNID = NewNIDV[ui]; 00858 if (NewNID < 0) { continue; } 00859 TUNGraph::TNodeI UI = G->GetNI(NewNID); 00860 NIDOPTV[NewNID] = 0; 00861 for (int e = 0; e < UI.GetDeg(); e++) { 00862 NIDOPTV[UI.GetNbrNId(e)] = 0; 00863 } 00864 } 00865 int OPTCnt = 0; 00866 for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } } 00867 if (! PlotNm.Empty()) { 00868 printf("\r%d iterations [%s] %d secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), int(TSecTm::GetCurTm().GetAbsSecs() - StartTm)); 00869 if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); } 00870 fflush(stdout); 00871 } 00872 if ((iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) { 00873 PrevIter = iter; 00874 double CurL = Likelihood(true); 00875 IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL)); 00876 printf("\r%d iterations, Likelihood: %f, Diff: %f [%d secs]", iter, CurL, CurL - PrevL, int(time(NULL) - InitTime)); 00877 fflush(stdout); 00878 if (CurL - PrevL <= Thres * fabs(PrevL)) { 00879 break; 00880 } 00881 else { 00882 PrevL = CurL; 00883 } 00884 } 00885 } 00886 if (! PlotNm.Empty()) { 00887 printf("\nMLE completed with %d iterations(%d secs)\n", iter, int(TSecTm::GetCurTm().GetAbsSecs() - StartTm)); 00888 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00889 } else { 00890 printf("\rMLE completed with %d iterations(%d secs)", iter, int(time(NULL) - InitTime)); 00891 fflush(stdout); 00892 } 00893 return iter; 00894 }