00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147 #include <fstream>
00148 #include <sstream>
00149 #include <sys/stat.h>
00150 #include "talker.h"
00151 #include <gsl/gsl_randist.h>
00152
// Running count of processed documents. Defined in another translation unit;
// reported by Talker::PrintStatistics().
extern int num_docs_processed;

// Text printed in response to the interactive "help" command.
// Keep this list in sync with the command tree in Talker::ProcessCommand().
// The "pvm ..." commands are internal (exchanged between tasks) and are
// deliberately not listed here.
static const char helptext[] = "Available commands are:\n"
  "quit - exits the program\n"
  "help - displays this text\n"
  "info mem - displays memory usage statistics\n"
  "\n"
  "info sampler - displays information about the selected sampler\n"
  "set sampler pagerank epsilon - selects ordinary PageRank algorithm\n"
  "set sampler dateb epsilon lambda - selects date biased PageRank algorithm\n"
  "set sampler tkleinberg epsilon hubs|auth - selects truncated Kleinberg algorithm\n"
  "run sampler n - runs the current Markov chain sampler for n iterations\n"
  "load tags filename - tags all documents listed in filename\n"
  "load index filename - loads leaf nodes from the index file filename\n"
  "run tagsampler n - runs the current sampler on tagged documents only\n"
  "reset counts - resets the Markov chain occupation counts\n"
  "reset seed n - sets the random seed of the current sampler to n\n"
  "save counts filename - prints the occupation counts in filename\n"
  "save tagcounts filename - prints the occupation counts for tagged documents only\n"
  "\n"
  "info graph - displays some statistics about the web graph\n"
  "save graph filename - prints the web link graph in filename\n"
  "";
00173
00174 Talker::Talker(const char* ripper_name, const char* ripper_tempdir,
00175 WebLinkGraph *ripper_web, GraphBuilder *ripper_gb,
00176 bool pvm_is_master, int pvm_numtasks) throw (exception) {
00177 name = ripper_name;
00178 tempdir = ripper_tempdir;
00179 web = ripper_web;
00180 gb = ripper_gb;
00181
00182 r = gsl_rng_alloc(gsl_rng_taus);
00183 if( !r ) {
00184 cerr << "error: couldn't initialize the random number generator" << endl;
00185 throw exception();
00186 }
00187
00188
00189 prsampler = new PageRankSampler(web);
00190 prsampler->SetParameters(0.1);
00191
00192 dbsampler = new DateBiasedPageRankSampler(web);
00193 dbsampler->SetParameters(0.1,0.1,gb->HighestDate() - gb->LowestDate());
00194
00195 tksampler = new TruncatedKleinbergSampler(web);
00196 tksampler->SetParameters(0.1,hubs);
00197
00198 defaultsampler = NULL;
00199
00200 leaflist = new LeafNodePtrList();
00201
00202 pvminterface = NULL;
00203
00204 assert(gb);
00205 pvminterface = new PVMInterface(name,
00206 pvm_is_master, pvm_numtasks,
00207 gb->LowestID(), gb->HighestID());
00208
00209 if( pvm_numtasks > 0 ) {
00210
00211 if( pvm_is_master ) {
00212 SetupFIFOs();
00213 }
00214
00215 nodequiv << tempdir << "ripper.";
00216 if( *name ) {
00217 nodequiv << name << ".";
00218 }
00219 nodequiv << "nodes.equiv";
00220 remove(nodequiv.str().c_str());
00221
00222
00223 cerr << "info: now building leafnodes" << endl;
00224 for(int k = 0; k < pvminterface->NumberOfOtherTasks(); k++) {
00225 ostringstream fname;
00226 fname << tempdir << "ripper." << pvminterface->Name(k) << ".index";
00227 ifstream f(fname.str().c_str());
00228 if( !f.fail() ) {
00229 if( !LoadLeaves(f) ) {
00230 cerr << "warning: index file was not properly loaded" << endl;
00231 }
00232 f.close();
00233 } else {
00234 cerr << "error: failed to open index file " << fname << endl;
00235 }
00236 }
00237 } else {
00238 SetupFIFOs();
00239 }
00240 }
00241
00242 Talker::~Talker() {
00243 cerr << "info: quitting." << endl;
00244 RemoveFIFOs();
00245 delete pvminterface;
00246 delete prsampler;
00247 delete dbsampler;
00248 delete leaflist;
00249 }
00250
00251
00252
00253 void Talker::SetupFIFOs() throw (exception) {
00254 cerr << "info: setting up FIFOs" << endl;
00255
00256 inputfifo << tempdir << "ripper.";
00257 if( *name ) {
00258 inputfifo << name << ".";
00259 }
00260 inputfifo << "input";
00261
00262 if( mkfifo(inputfifo.str().c_str(), S_IWUSR|S_IRUSR ) < 0 ) {
00263 cerr << "couln't create FIFO, or FIFO already exists [" << inputfifo.str() << "]" << endl;
00264 throw exception();
00265 }
00266
00267 outputfifo << tempdir << "ripper.";
00268 if( *name ) {
00269 outputfifo << name << ".";
00270 }
00271 outputfifo << "output";
00272
00273 if( mkfifo(outputfifo.str().c_str(), S_IWUSR|S_IRUSR ) < 0 ) {
00274 cerr << "couln't create FIFO, or FIFO already exists [" << outputfifo.str() << "]" << endl;
00275 throw exception();
00276 }
00277
00278 }
00279
00280
00281 void Talker::RemoveFIFOs() {
00282 remove(inputfifo.str().c_str());
00283 remove(outputfifo.str().c_str());
00284 }
00285
00286
00287
00288
00289
00290
00291
00292 void Talker::Talk() throw (exception) {
00293
00294 ifstream in;
00295 ofstream out;
00296
00297 defaultsampler = prsampler;
00298
00299 if( (pvminterface->NumberOfTasks() == 0) || pvminterface->IsMaster() ) {
00300 cerr << "info: now listening for jack..." << endl;
00301
00302 in.open(inputfifo.str().c_str());
00303 out.open(outputfifo.str().c_str(),ios::nocreate|ios::app);
00304 if( !in || !out ) {
00305 cerr << "error: cannot open FIFO " << endl;
00306 throw exception();
00307 }
00308
00309
00310
00311
00312
00313 out << '\r';
00314 out.flush();
00315
00316 cerr << "info: now entered interactive mode." << endl;
00317 }
00318
00319
00320 char commandline[1024];
00321 bool quit = false;
00322
00323 while(!quit) {
00324 assert(pvminterface->AllStandby());
00325 memset(commandline,0,1024);
00326
00327
00328 if( (pvminterface->NumberOfTasks() == 0) || pvminterface->IsMaster() ) {
00329
00330 in.getline(commandline, 1024);
00331 cerr << "info: received command [" << commandline << "]" << endl;
00332
00333 if( *commandline == 0 ) {
00334 out << '\r';
00335 out.flush();
00336 continue;
00337 }
00338 }
00339
00340
00341
00342
00343
00344 do {
00345
00346 if( *commandline ) {
00347
00348 if(pvminterface->NumberOfTasks() > 0) {
00349 cerr << "pvm: received message [" << commandline << "]" << endl;
00350 }
00351 pvminterface->LeaveStandbyGroup();
00352 quit = ProcessCommand(commandline, out);
00353 pvminterface->JoinStandbyGroup();
00354 } else if( (pvminterface->NumberOfTasks() > 0) &&
00355 pvminterface->IsMaster() &&
00356 pvminterface->AllStandby() ) {
00357
00358 cerr << "pvm: master says finish_now" << endl;
00359 pvminterface->BroadcastCommand("pvmcontrol_finish_now");
00360 break;
00361 }
00362
00363
00364 pvminterface->GetCommandWithTimeout(commandline);
00365
00366 } while( !quit &&
00367 (pvminterface->NumberOfTasks() > 0) &&
00368 (strcmp(commandline,"pvmcontrol_finish_now") != 0) );
00369
00370
00371 if( (pvminterface->NumberOfTasks() == 0) || pvminterface->IsMaster() ) {
00372 out << '\r';
00373 out.flush();
00374 } else {
00375 cerr << "pvm: finished" << endl;
00376 }
00377 }
00378
00379
00380 if( (pvminterface->NumberOfTasks() == 0) || pvminterface->IsMaster() ) {
00381 in.close();
00382 out.close();
00383 }
00384 }
00385
00386
00387 bool greater_LeafCountsStruct(const LeafCountsStruct x, const LeafCountsStruct y) {
00388 return (x.id > y.id);
00389 }
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403 void Talker::PropagateLeafCounts() throw (invalid_argument) {
00404
00405 LeafCountsStruct* counts = new LeafCountsStruct[MAXLEAFMESSAGE_SIZE];
00406
00407 int taskno;
00408 long num_particles;
00409 do {
00410 taskno = -1;
00411 num_particles = 0;
00412 uint32 c = 0;
00413 for(LeafNodePtrList::iterator i = leaflist->begin();
00414 i != leaflist->end(); i++) {
00415 if( (*i)->OccupationCount() > 0 ) {
00416 if( taskno < 0 ) {
00417 if( (taskno = pvminterface->FindTask((*i)->ID())) > -1 ) {
00418 counts[c].id = (*i)->ID();
00419 counts[c].occupation_count = (*i)->OccupationCount();
00420 num_particles += (*i)->OccupationCount();
00421 (*i)->ClearOccupationCount();
00422 c++;
00423 }
00424 } else if ( ((*i)->ID() >= pvminterface->StartID(taskno)) &&
00425 ((*i)->ID() <= pvminterface->StopID(taskno)) ) {
00426 counts[c].id = (*i)->ID();
00427 counts[c].occupation_count = (*i)->OccupationCount();
00428 num_particles += (*i)->OccupationCount();
00429 (*i)->ClearOccupationCount();
00430 if( ++c == (MAXLEAFMESSAGE_SIZE - 1)) {
00431 break;
00432 }
00433 }
00434 }
00435 }
00436
00437 if( taskno > -1 ) {
00438
00439 cerr << "pvm: sending " << c << " LeafNodes to " << pvminterface->Name(taskno)
00440 << " (" << num_particles << " particles)" << endl;
00441 sort(counts,counts+c,greater_LeafCountsStruct);
00442
00443 if( sizeof(unsigned int) == sizeof(uint32) ) {
00444 uint32 size = sizeof(LeafCountsStruct)*c/sizeof(unsigned int);
00445 assert(size > 0);
00446 pvminterface->SendCommand(taskno, "pvm run sampler");
00447 pvminterface->SendLeafCounts(taskno, (unsigned int*)counts, size);
00448 } else {
00449 throw invalid_argument("sizeof(unsigned int) != sizeof(uint32)");
00450 }
00451 }
00452
00453 } while( (taskno > -1) && !pvminterface->MessagePending());
00454
00455 delete[] counts;
00456 }
00457
00458 bool Talker::ProcessCommand(char *commandline, ostream& out) {
00459 bool quit = false;
00460 char *t;
00461
00462 switch( GetToken(commandline, out, "try help", 9,
00463 "quit", "help", "info", "save",
00464 "reset", "set", "run", "load", "pvm") ) {
00465 case 0:
00466 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00467 pvminterface->BroadcastCommand("quit");
00468 }
00469 quit = true;
00470 break;
00471 case 1:
00472 out << helptext;
00473 break;
00474 case 2:
00475 switch( GetToken(NULL, out, "try info [mem|sampler|graph]", 3,
00476 "mem", "sampler", "graph") ) {
00477 case 0:
00478 PrintStatistics(out);
00479 break;
00480 case 1:
00481 char buf[100];
00482 out << "Current sampler is: " << defaultsampler->Name(buf) << endl;
00483 out << "Total perfect samples produced so far: " << static_cast<int>(defaultsampler->LastRunSize()) << endl;
00484 out << "Total tagged samples produced so far: " << static_cast<int>(defaultsampler->LastTaggedRunSize()) << endl;
00485 break;
00486 case 2:
00487 PrintStatisticsGraph(out);
00488 web->StatisticsFromSets(out);
00489 break;
00490 }
00491 break;
00492 case 3:
00493 switch( GetToken(NULL, out, "try save [graph|counts]", 3,
00494 "graph", "counts", "tagcounts") ) {
00495 case 0:
00496 if( (t = GetTokenVar(NULL, out,
00497 "try save graph filename")) ) {
00498 ofstream g(t);
00499 if( !g.fail() ) {
00500 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00501 ostringstream s;
00502 s << "pvm save graph " << t;
00503 pvminterface->BroadcastCommand(s.str().c_str());
00504 }
00505 web->PrintWebGraph(g);
00506 out << "graph stored in " << t << endl;
00507 g.close();
00508 } else {
00509 out << "Error: could not open file [" << t << "]" << endl;
00510 }
00511 }
00512 break;
00513 case 1:
00514 if( (t = GetTokenVar(NULL, out,
00515 "try save counts filename")) ) {
00516 ofstream g(t);
00517 if( !g.fail() ) {
00518 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00519 ostringstream s;
00520 s << "pvm save counts " << t;
00521 pvminterface->BroadcastCommand(s.str().c_str());
00522 }
00523 defaultsampler->PrintCounts(g);
00524 out << "counts stored in " << t << endl;
00525 g.close();
00526 } else {
00527 out << "Error: could not open file [" << t << "]" << endl;
00528 }
00529 }
00530 break;
00531 case 2:
00532 if( (t = GetTokenVar(NULL, out,
00533 "try save tagcounts filename")) ) {
00534 ofstream g(t);
00535 if( !g.fail() ) {
00536 defaultsampler->PrintTagCounts(g);
00537 out << "tagcounts stored in " << t << endl;
00538 g.close();
00539 } else {
00540 out << "Error: could not open file [" << t << "]" << endl;
00541 }
00542 }
00543 break;
00544 }
00545 break;
00546 case 4:
00547 switch( GetToken(NULL, out, "try reset counts", 2,
00548 "counts", "seed") ) {
00549 case 0:
00550 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00551 pvminterface->BroadcastCommand("reset counts");
00552 }
00553 defaultsampler->ClearCounts();
00554 break;
00555 case 1:
00556 if( (t = GetTokenVar(NULL, out,
00557 "try reset seed 0")) ) {
00558 unsigned long int sigma = atol(t);
00559 if( sigma >= 0 ) {
00560
00561 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00562 ostringstream s;
00563 s << "reset seed " << sigma;
00564 pvminterface->BroadcastCommand(s.str().c_str());
00565 }
00566
00567 defaultsampler->SetRandomSeed(sigma);
00568
00569 } else {
00570 out << "Token must be number > 0, try reset seed 0" << endl;
00571 }
00572 }
00573 break;
00574 }
00575 break;
00576 case 5:
00577 switch( GetToken(NULL, out, "try set sampler pagerank epsilon", 1,
00578 "sampler") ) {
00579 case 0:
00580 switch( GetToken(NULL, out, "try set sampler pagerank epsilon", 3,
00581 "pagerank", "dateb", "tkleinberg") ) {
00582 case 0:
00583 if( (t = GetTokenVar(NULL, out,
00584 "try set sampler pagerank 0.3")) ) {
00585 double epsilon = atof(t);
00586 if( (epsilon > 0) && (epsilon < 1) ) {
00587
00588 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00589 ostringstream s;
00590 s << "set sampler pagerank " << epsilon;
00591 pvminterface->BroadcastCommand(s.str().c_str());
00592 }
00593
00594 prsampler->SetParameters(epsilon);
00595 defaultsampler = prsampler;
00596
00597 } else {
00598 out << "Token must be 0 < epsilon < 1, try set sampler pagerank 0.3" << endl;
00599 }
00600 }
00601 break;
00602 case 1:
00603 if( (t = GetTokenVar(NULL, out,
00604 "try set sampler dateb 0.1 0.001")) ) {
00605 double epsilon = atof(t);
00606 if( (epsilon > 0) && (epsilon < 1) ) {
00607 if( (t = GetTokenVar(NULL, out,
00608 "try set sampler dateb 0.1 0.001")) ) {
00609 double lambda = atof(t);
00610 if( lambda > 0 ) {
00611
00612 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00613 ostringstream s;
00614 s << "set sampler dateb " << epsilon << " " << lambda;
00615 pvminterface->BroadcastCommand(s.str().c_str());
00616 }
00617
00618 dbsampler->SetParameters(epsilon,lambda,gb->HighestDate() - gb->LowestDate());
00619 defaultsampler = dbsampler;
00620
00621 } else {
00622 out << "Token must be lambda > 0, try set sampler dateb 0.1 0.001" << endl;
00623 }
00624 }
00625 } else {
00626 out << "Token must be 0 < epsilon < 1, try set sampler pagerank 0.3" << endl;
00627 }
00628 }
00629 break;
00630 case 2:
00631 if( leaflist->size() > 0 ) {
00632 out << "tkleinberg doesn't work with leafnodes/distributed computing" << endl;
00633 } else if( (t = GetTokenVar(NULL, out,
00634 "try set sampler tkleinberg 0.1 auth")) ) {
00635 double epsilon = atof(t);
00636 if( (epsilon > 0) && (epsilon < 1) ) {
00637 if( (t = GetTokenVar(NULL, out,
00638 "try set sampler tkleinberg 0.1 auth")) ) {
00639 ktype what = undef;
00640 if( (strcmp(t,"auth") == 0) ) {
00641 what = auth;
00642 } else if( (strcmp(t,"hubs") == 0) ) {
00643 what = hubs;
00644 } else {
00645 out << "Token must be hubs or auth, try set sampler tkleinberg 0.1 auth" << endl;
00646 }
00647 if( what != undef ) {
00648 tksampler->SetParameters(epsilon,what);
00649 defaultsampler = tksampler;
00650 }
00651 }
00652 } else {
00653 out << "Token must be 0 < epsilon < 1, try set sampler pagerank 0.3" << endl;
00654 }
00655 }
00656 break;
00657 }
00658 break;
00659 }
00660 break;
00661 case 6:
00662 switch( GetToken(NULL, out, "try sampler n", 2,
00663 "sampler", "tagsampler") ) {
00664 case 0:
00665 if( (t = GetTokenVar(NULL, out,
00666 "try run sampler 5000")) ) {
00667 long num_samples = atol(t);
00668 if( num_samples > 0 ) {
00669
00670
00671 if( (pvminterface->NumberOfTasks() > 0) && pvminterface->IsMaster() ) {
00672
00673 int totalgraphsize = web->size();
00674 for(int i = 0; i < pvminterface->NumberOfOtherTasks(); i++ ) {
00675 totalgraphsize += (pvminterface->StopID(i) - pvminterface->StartID(i));
00676 }
00677
00678
00679 for(int i = 0; i < pvminterface->NumberOfOtherTasks(); i++) {
00680 double p = static_cast<double>(pvminterface->StopID(i) - pvminterface->StartID(i))/totalgraphsize;
00681 long howmany = gsl_ran_binomial(r, p, num_samples);
00682 num_samples -= howmany;
00683 ostringstream s;
00684 s << "run sampler " << howmany;
00685 pvminterface->SendCommand(i,s.str().c_str());
00686 }
00687 cerr << "pvm: finished delegating, ready to: run sampler " << num_samples << endl;
00688 }
00689
00690 defaultsampler->SimulateForward(num_samples);
00691
00692 PropagateLeafCounts();
00693
00694 } else {
00695 out << "Token must be 0 < n < " << LONG_MAX << ", try run sampler 5000" << endl;
00696 }
00697 }
00698 break;
00699 case 1:
00700 if( (t = GetTokenVar(NULL, out,
00701 "try run tagsampler 5000")) ) {
00702 long num_samples = atol(t);
00703 if( (num_samples > 0) && (leaflist->size() <= 0)) {
00704 const uint32 *fromsetsizes = web->GetFromSetSizes();
00705 if( fromsetsizes[0] > 0 ) {
00706 defaultsampler->TaggedSimulateForward(num_samples, fromsetsizes);
00707 } else {
00708 out << "tagsampler cannot run. You must first tag some documents. See load tags command." << endl;
00709 }
00710 } else {
00711 out << "Token must be 0 < n < " << LONG_MAX << ", try run tagsampler 5000.\n"
00712 "Note: cannot run tagsampler with leafnodes." << endl;
00713 }
00714 }
00715 break;
00716 }
00717 break;
00718 case 7:
00719 switch( GetToken(NULL, out, "try load [index|tags]", 2,
00720 "index", "tags") ) {
00721 case 0:
00722 if( (t = GetTokenVar(NULL, out,
00723 "try load index filename")) ) {
00724 ifstream g(t);
00725 if( !g.fail() ) {
00726 if( !LoadLeaves(g) ) {
00727 out << "Warning: index file was not properly loaded - leaf nodes may have inconsistent states" << endl;
00728 }
00729 g.close();
00730 } else {
00731 out << "Error: could not open file [" << t << "]" << endl;
00732 }
00733 }
00734 break;
00735 case 1:
00736 if( (t = GetTokenVar(NULL, out,
00737 "try load tags filename")) ) {
00738 ifstream g(t);
00739 if( !g.fail() ) {
00740 web->ClearTags();
00741 if( !BuildTags(g) ) {
00742 out << "Warning: some tagged documents not recognized/in dataset will be ignored" << endl;
00743 }
00744 g.close();
00745 web->BuildFromSets(16);
00746 web->MeasureFromSets();
00747 } else {
00748 out << "Error: could not open file [" << t << "]" << endl;
00749 }
00750 }
00751 break;
00752 }
00753 break;
00754 case 8:
00755 switch( GetToken(NULL, out, "non-user command", 3,
00756 "run", "save", "cat") ) {
00757 case 0:
00758 switch( GetToken(NULL, out, "try sampler n", 1,
00759 "sampler") ) {
00760 case 0:
00761
00762 defaultsampler->ClearAllocForward();
00763
00764 LeafCountsStruct* counts = new LeafCountsStruct[MAXLEAFMESSAGE_SIZE];
00765 uint32 size;
00766
00767
00768 bool repeat = false;
00769 bool stop = false;
00770 do {
00771
00772
00773 size = sizeof(LeafCountsStruct)*MAXLEAFMESSAGE_SIZE/sizeof(unsigned int);
00774 pvminterface->GetLeafCounts((unsigned int*)counts, &size);
00775 size = sizeof(unsigned int)*size/sizeof(LeafCountsStruct);
00776 assert(size > 0);
00777
00778
00779
00780 assert(counts[0].id <= gb->HighestID());
00781 assert(counts[size-1].id >= gb->LowestID());
00782
00783 cerr << "pvm: loading " << size << " LeafNodes" << endl;
00784 for(uint32 c = 0; c < size; c++) {
00785 defaultsampler->IncrementAllocForward(counts[c].id - gb->LowestID(),
00786 counts[c].occupation_count);
00787 }
00788
00789 commandline[0] = 0;
00790
00791
00792 if( pvminterface->MessagePending() ) {
00793 pvminterface->GetCommandWithTimeout(commandline);
00794 assert(*commandline);
00795 repeat = (strncmp(commandline,"pvm run sampler", 15) == 0);
00796 } else {
00797 stop = true;
00798 }
00799
00800 } while( repeat && !stop);
00801 delete[] counts;
00802
00803 cerr << "pvm: now doing partial simulation" << endl;
00804
00805 defaultsampler->SimulateAllocForward();
00806
00807 PropagateLeafCounts();
00808
00809
00810
00811 if( *commandline && !repeat ) {
00812 return ProcessCommand(commandline,out);
00813 }
00814 break;
00815 }
00816 break;
00817 case 1:
00818 switch( GetToken(NULL, out, "non-user command", 3,
00819 "graph", "counts", "cat") ) {
00820 case 0:
00821 if( (t = GetTokenVar(NULL, out,
00822 "try save graph filename")) ) {
00823 ostringstream s;
00824 s << tempdir << "ripper." << name << ".graph";
00825 ofstream g(s.str().c_str());
00826 if( !g.fail() ) {
00827 web->PrintWebGraph(g);
00828 g.close();
00829 s.seekp(0);
00830 s << "pvm cat ripper." << name << ".graph " << t;
00831 pvminterface->SendCommand(pvminterface->LastMessageOriginator(), s.str().c_str());
00832 } else {
00833 cerr << "pvm: error: could not open " << s.str() << endl;
00834 }
00835 }
00836 break;
00837 case 1:
00838 if( (t = GetTokenVar(NULL, out,
00839 "try save graph filename")) ) {
00840 ostringstream s;
00841 s << tempdir << "ripper." << name << ".counts";
00842 ofstream g(s.str().c_str());
00843 if( !g.fail() ) {
00844 defaultsampler->PrintCounts(g);
00845 g.close();
00846 s.seekp(0);
00847 s << "pvm cat ripper." << name << ".counts " << t;
00848 pvminterface->SendCommand(pvminterface->LastMessageOriginator(), s.str().c_str());
00849 } else {
00850 cerr << "pvm: error: could not open " << s.str() << endl;
00851 }
00852 }
00853 break;
00854 }
00855 break;
00856 case 2:
00857 if( (t = GetTokenVar(NULL, out,
00858 "try pvm cat source dest")) ) {
00859 ostringstream s;
00860 s << "cat " << tempdir << t;
00861 if( (t = GetTokenVar(NULL, out,
00862 "try pvm cat source dest")) ) {
00863 s << " >> " << t;
00864 cerr << "pvm: " << s.str() << endl;
00865 if( system(s.str().c_str()) ) {
00866 out << "Couldn't execute command: " << s.str() << endl;
00867 }
00868 }
00869 }
00870 break;
00871 }
00872 break;
00873 }
00874
00875 commandline[0] = 0;
00876 return quit;
00877 }
00878
00879
00880
00881 void Talker::PrintStatistics(ostream& o) {
00882 o << "Number of documents processed: " << num_docs_processed << endl;
00883 gb->StatisticsMem(o);
00884 o << "\nTotal number of leafnodes added: " << leaflist->size() << endl;
00885 o << "Total heap needed for all leafnodes: " << sizeof(LeafNode)*leaflist->size() << endl;
00886 }
00887
00888
00889 void Talker::PrintStatisticsGraph(ostream& o) {
00890 gb->StatisticsGraph(o);
00891 }
00892
00893
00894
00895
00896
00897 bool Talker::BuildTags(ifstream& g) {
00898 string s;
00899 bool problems = false;
00900 while( g && !g.eof() ) {
00901 g >> s;
00902 if( s[0] == '#' ) {
00903 getline(g,s);
00904 continue;
00905 }
00906 WebNodePtr w = gb->FindWebNode(s.c_str());
00907 if( w ) {
00908 w->SetTag(0);
00909 } else {
00910 problems = true;
00911 cerr << "warning: unrecognized tag [" << s << "]" << endl;
00912 }
00913 }
00914 return !problems;
00915 }
00916
00917
00918
00919
00920
00921 int Talker::GetToken(char* command, ostream& out, char* message,
00922 int n, ...) {
00923 char *t = strtok(command, " ");
00924 if( !t ) {
00925 out << "Missing token, " << message << endl;
00926 } else {
00927 va_list ap;
00928 va_start(ap, n);
00929 for(int k = 0; k < n; k++) {
00930 if( strcmp(t, va_arg(ap, char*)) == 0 ) {
00931 va_end(ap);
00932 return k;
00933 }
00934 }
00935 va_end(ap);
00936 out << "Unrecognized token " << t << ", " << message << endl;
00937 }
00938 return -1;
00939 }
00940
00941
00942
00943
00944 char *Talker::GetTokenVar(char* command, ostream& out, char* message) {
00945 char *t = strtok(command, " ");
00946 if( !t ) {
00947 out << "Missing token, " << message << endl;
00948 } else if( strlen(t) > 0 ) {
00949 return t;
00950 } else {
00951 out << "Missing token, " << message << endl;
00952 }
00953 return NULL;
00954 }
00955
00956 typedef struct IDPair {
00957 uint32 one;
00958 uint32 two;
00959 };
00960
00961 #define EQUIV_SIZE 1024
00962
00963 bool Talker :: LoadLeaves(ifstream& g) {
00964 string alias,url;
00965 uint32 nodeID;
00966 uint16 date;
00967 bool problems = false;
00968
00969 gb->SetupLeafTable();
00970
00971
00972 IDPair *equivalences = new IDPair[EQUIV_SIZE];
00973 int equiv_pointer = 0;
00974 ofstream neq(nodequiv.str().c_str(),std::ios::app);
00975
00976 while( g && !g.eof() ) {
00977
00978 g >> url;
00979 if( url[0] == '#' ) {
00980 getline(g,url);
00981 continue;
00982 }
00983 g >> nodeID >> date;
00984 getline(g,alias);
00985
00986 WebNodePtr w = gb->FindWebNode(url.c_str());
00987 if( w ) {
00988
00989
00990 equivalences[equiv_pointer].one = w->ID();
00991 equivalences[equiv_pointer].two = nodeID;
00992 if( ++equiv_pointer >= EQUIV_SIZE ) {
00993 for(int z = 0; z < EQUIV_SIZE; z++) {
00994 neq << equivalences[z].one << " = " << equivalences[z].two << endl;
00995 }
00996 equiv_pointer = 0;
00997 }
00998 } else {
00999 ptrdiff_t key = gb->FindLeafNodeKey(url.c_str());
01000 if( key >= 0 ) {
01001 LeafNodePtr leaf = new LeafNode(nodeID);
01002 leaf->SetDate(date);
01003 gb->AddLeaf(key, leaf);
01004 leaflist->push_front(leaf);
01005 } else {
01006
01007
01008 }
01009 }
01010 }
01011
01012
01013 for(int z = 0; z < equiv_pointer; z++) {
01014 neq << equivalences[z].one << " = " << equivalences[z].two << endl;
01015 }
01016 delete[] equivalences;
01017 neq.close();
01018
01019 gb->UpdateLeafLinks();
01020 return !problems;
01021 }
01022