00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "graphbuilder.h"
00023
00024 #define Mb 1048576L
00025
00026
00027
00028
00029
00030 GraphBuilder :: GraphBuilder(int smem, int jmem, int nmem, int lmem, bool sl) {
00031
00032 trie = new Trie((smem * Mb), (jmem * Mb)/sizeof(SimpleHashPair<char*>));
00033
00034
00035 long numels = (nmem * Mb)/sizeof(SimpleHashPair<WebNodePtr>);
00036 nodetable = new SimpleWebNodePtrHashTable(numels);
00037
00038
00039 graph = new WebLinkGraph();
00040 graph_is_docked = true;
00041
00042
00043 urlfilter = new URLFilter(true);
00044 linkset = new RawLinkSet;
00045
00046 curdoc = NULL;
00047 doc_url[0] = 0;
00048 doc_alias[0] = 0;
00049 curdoc_baseurl.Clear();
00050
00051
00052 stats.heap_used_webnodes = 0;
00053
00054 stats.cumulative_tolinks = 0;
00055 stats.cumulative_fromlinks = 0;
00056 stats.cumulative_dangling = 0;
00057 stats.cumulative_leaflinks = 0;
00058
00059 stats.nodetable_insertions = 0;
00060 stats.nodetable_alias_insertions = 0;
00061
00062 stats.lowid = kint32max;
00063 stats.highid = 0;
00064 stats.lowdate = kuint16max;
00065 stats.highdate = 0;
00066
00067 flags.show_links = sl;
00068 flags.leaftable_memory = lmem;
00069
00070 }
00071
00072 GraphBuilder :: ~GraphBuilder() {
00073
00074 delete linkset;
00075 delete nodetable;
00076 delete trie;
00077 if( leaftable ) { delete leaftable; }
00078 }
00079
00080 void GraphBuilder :: StatisticsMem(ostream& o) {
00081
00082 trie->Statistics(o);
00083
00084 o << "\nTotal number of insertions in nodetable: " << stats.nodetable_insertions << endl;
00085 o << "Total number of aliases in nodetable: " << stats.nodetable_alias_insertions << endl;
00086 o << "Heap needed for nodetable (bytes): " << ((stats.nodetable_insertions + stats.nodetable_alias_insertions) * sizeof(SimpleHashPair<WebNodePtr>))<< endl;
00087 o << "Heap reserved for nodetable (bytes): " << (nodetable->Size() * sizeof(SimpleHashPair<WebNodePtr>)) << endl;
00088
00089 o << "\nMinimum size of each webnode (bytes): " << sizeof(WebNode) << endl;
00090 o << "Heap needed for all webnodes (bytes): " << stats.heap_used_webnodes << endl;
00091 o << "Unallocated webnodes (bytes): " << WebNode::FreeBlocks() << endl;
00092 o << "Unallocated tolinks (bytes): " << MemPool<LinkStruct>::FreeBlocks1() + MemPool<LinkStruct>::FreeBlocks2() << endl;
00093 }
00094
00095 void GraphBuilder :: StatisticsGraph(ostream& o) {
00096 o << "Total number of nodes in web graph: " << graph->size() << endl;
00097 o << "Lowest ID number in web graph: " << stats.lowid << endl;
00098 o << "Highest ID number in web graph: " << stats.highid << endl;
00099 o << "Lowest date in web graph: " << stats.lowdate << endl;
00100 o << "Highest date in web graph: " << stats.highdate << endl;
00101
00102 o << "\nAverage number of tolinks/node: " << static_cast<double>(stats.cumulative_tolinks)/graph->size() << endl;
00103 o << "Average number of dangling tolinks/node: " << static_cast<double>(stats.cumulative_dangling)/graph->size() << endl;
00104 o << "Average number of fromlinks/node: " << static_cast<double>(stats.cumulative_fromlinks)/graph->size() << endl;
00105 o << "Average number of leaflinks/node: " << static_cast<double>(stats.cumulative_leaflinks)/graph->size() << endl;
00106 }
00107
00108
00109 void GraphBuilder :: NodeInitialize(uint32 idno) {
00110
00111 assert( !curdoc );
00112
00113 curdoc = new WebNode(idno);
00114 doc_url[0] = 0;
00115 doc_alias[0] = 0;
00116 curdoc_baseurl.Clear();
00117
00118
00119
00120 stats.lowid = min(stats.lowid, idno);
00121 stats.highid = max(stats.highid, idno);
00122
00123 }
00124
00125
00126
00127
00128
00129
00130
00131 void GraphBuilder :: NodeLaunch() {
00132
00133 assert( curdoc );
00134
00135 stats.lowdate = (curdoc->Date() > 0) ?
00136 min(stats.lowdate, curdoc->Date()) : stats.lowdate;
00137 stats.highdate = max(stats.highdate, curdoc->Date());
00138
00139 graph->push_front(curdoc);
00140 linkset->clear();
00141
00142 curdoc = NULL;
00143 doc_url[0] = 0;
00144 doc_alias[0] = 0;
00145 curdoc_baseurl.Clear();
00146
00147 }
00148
00149
00150 void GraphBuilder :: NodeSetDate(unsigned short adate) {
00151 curdoc->SetDate(adate);
00152 }
00153
00154
00155
00156
00157
00158 void GraphBuilder :: NodeInsertLinks() {
00159 curdoc->InsertRawLinks(linkset);
00160 }
00161
00162
00163
00164
00165
00166
00167 void GraphBuilder :: NodeSetURL(const char *docurl, const char *aliasurl) {
00168
00169 docurl__ = docurl;
00170 aliasurl__ = aliasurl;
00171
00172 assert( !strlen(doc_url) );
00173
00174
00175
00176 ContentType contype;
00177 strcpy(doc_url, urlfilter->FormatURL(docurl, strlen(docurl), NULL, &contype));
00178
00179
00180
00181
00182 {
00183 ptrdiff_t key = trie->InsertURL(urlfilter->CompressURL(urlfilter->DeindexURL(doc_url)));
00184
00185 WebNodePtr w = nodetable->Find(key);
00186
00187 if( !w ) {
00188
00189 assert(curdoc);
00190
00191 nodetable->Insert(key, curdoc);
00192
00193 stats.nodetable_insertions++;
00194 } else {
00195 cerr << "warning: aliased or duplicate node [" << doc_url << "]" << endl;
00196
00197
00198 delete curdoc;
00199 curdoc = w;
00200
00201 }
00202 }
00203
00204
00205
00206 urlfilter->ParseURL(doc_url, curdoc_baseurl.scheme, curdoc_baseurl.netloc,
00207 curdoc_baseurl.query, curdoc_baseurl.params, curdoc_baseurl.path);
00208
00209
00210 if( aliasurl ) {
00211 strcpy(doc_alias, urlfilter->FormatURL(aliasurl, strlen(aliasurl), NULL, &contype));
00212
00213 ptrdiff_t key = trie->InsertURL(urlfilter->CompressURL(urlfilter->DeindexURL(doc_alias)));
00214
00215 WebNodePtr w = nodetable->Find(key);
00216 if( !w ) {
00217 assert(curdoc);
00218 nodetable->Insert(key, curdoc);
00219 stats.nodetable_alias_insertions++;
00220 } else {
00221
00222 if(w != curdoc) {
00223 cerr << "warning: alias points to a different node. Dataset inconsistent? [" << doc_alias << "]" << endl;
00224
00225 };
00226 }
00227 }
00228
00229 if( flags.show_links ) {
00230 cout << doc_url << endl;
00231 }
00232 }
00233
00234 const char * GraphBuilder :: NodeGetURL_() {
00235 return docurl__;
00236 }
00237
00238 const char * GraphBuilder :: NodeGetAlias_() {
00239 return aliasurl__;
00240 }
00241
00242 const char * GraphBuilder :: NodeGetURL() {
00243 return doc_url;
00244 }
00245
00246 const char * GraphBuilder :: NodeGetAlias() {
00247 return doc_alias;
00248 }
00249
00250 const uint32 GraphBuilder :: NodeGetID() {
00251 return curdoc->ID();
00252 }
00253
00254
00255 const uint16 GraphBuilder :: NodeGetDate() {
00256 return curdoc->Date();
00257 }
00258
00259
00260
00261 WebLinkGraph* GraphBuilder :: UndockWebGraph() {
00262 cerr << "info: now undocking webgraph" << endl;
00263 if( graph_is_docked ) {
00264
00265 for(WebLinkGraph::iterator i = graph->begin(); i != graph->end(); i++) {
00266 assert(*i);
00267
00268
00269 (*i)->NormalizeRawLinks(nodetable);
00270 }
00271
00272
00273 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00274 assert(*i);
00275 for(int k = 0; k < (*i)->NumberOfValidToLinks(); k++) {
00276 assert((*i)->ValidToLink(k));
00277 ((*i)->ValidToLink(k))->IncrementNumberOfFromLinks();
00278 }
00279 }
00280
00281
00282 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00283 assert(*i);
00284 for(int k = 0; k < (*i)->NumberOfValidToLinks(); k++) {
00285 assert((*i)->ValidToLink(k));
00286 ((*i)->ValidToLink(k))->AppendFromLink(*i);
00287 }
00288 stats.heap_used_webnodes += (*i)->RealSize();
00289 stats.cumulative_dangling += (*i)->NumberOfDanglingToLinks();
00290 stats.cumulative_fromlinks += (*i)->NumberOfValidFromLinks();
00291 stats.cumulative_leaflinks += (*i)->NumberOfLeafLinks();
00292 stats.cumulative_tolinks += (*i)->NumberOfValidToLinks();
00293 }
00294
00295 #ifndef NDEBUG
00296
00297 cerr << "debug: check that fromlinks work properly" << endl;
00298 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00299 for(int k = 0; k < (*i)->NumberOfValidFromLinks(); k++) {
00300 assert((*i)->ValidFromLink(k));
00301 }
00302 }
00303 #endif
00304 graph_is_docked = false;
00305 }
00306 cerr << "info: webgraph is now undocked" << endl;
00307 return graph;
00308 }
00309
00310 void GraphBuilder :: UpdateLeafLinks() {
00311 assert(leaftable);
00312 stats.cumulative_dangling = 0;
00313 stats.cumulative_leaflinks = 0;
00314 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00315 assert(*i);
00316 (*i)->UpdateLeafLinks(leaftable);
00317 stats.cumulative_leaflinks += (*i)->NumberOfLeafLinks();
00318 stats.cumulative_dangling += (*i)->NumberOfDanglingToLinks();
00319 }
00320
00321 #ifndef NDEBUG
00322
00323 cerr << "debug: check that leaflinks work properly" << endl;
00324 for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00325 for(int k = 0; k < (*i)->NumberOfLeafLinks(); k++) {
00326 assert((*i)->ValidLeafLink(k));
00327 assert((*i)->ValidLeafLink(k)->OccupationCount() == 0);
00328 }
00329 }
00330 #endif
00331
00332 }
00333
00334
00335 WebNode* GraphBuilder :: FindWebNode(const char *url) {
00336 ptrdiff_t key = trie->FindURL(urlfilter->CompressURL(urlfilter->DeindexURL(url)));
00337 if( key > -1 ) {
00338 WebNodePtr w = nodetable->Find(key);
00339 if( w ) {
00340 return w;
00341 }
00342 }
00343 return NULL;
00344 }
00345
00346
00347 const ptrdiff_t GraphBuilder :: FindLeafNodeKey(const char *url) {
00348 return trie->FindURL(urlfilter->CompressURL(urlfilter->DeindexURL(url)));
00349 }
00350
00351 void GraphBuilder :: SetupLeafTable() {
00352 if( !leaftable ) {
00353 leaftable = new SimpleLeafNodePtrHashTable((flags.leaftable_memory * Mb)/sizeof(SimpleHashPair<LeafNodePtr>));
00354 }
00355 leaftable->Clear();
00356 }
00357
00358 void GraphBuilder :: AddLeaf(const ptrdiff_t key, LeafNodePtr leaf) {
00359 assert(leaftable);
00360 if( !leaftable->Find(key) ) {
00361 leaftable->Insert(key, leaf);
00362 } else {
00363 cerr << "warning: duplicate leaf ignored" << endl;
00364 }
00365
00366 stats.lowdate = (leaf->Date() > 0) ?
00367 min(stats.lowdate, leaf->Date()) : stats.lowdate;
00368 stats.highdate = max(stats.highdate, leaf->Date());
00369 }
00370
00371 void GraphBuilder :: TrieInsertLinkURL(const char *url) {
00372 ptrdiff_t key = trie->InsertURL(urlfilter->CompressURL(urlfilter->DeindexURL(url)));
00373 assert(*(trie->bigs + key) == 0);
00374 linkset->insert(key);
00375 }
00376