00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include <fstream>
00031 #include <sstream>
00032 #include <vector>
00033 #include "handler-parser.h"
00034 #include "repos-reader.h"
00035 #include "parseelt.h"
00036
00037
00038 #include <fstream>
00039 #include "graphbuilder.h"
00040 #include "linkgraph.h"
00041 #include "talker.h"
00042
// Writes errmsg followed by a newline (and flush) to standard error, then
// terminates the process with exit status 1. This function never returns.
static void error(string errmsg) {
  const int kFailureStatus = 1;
  cerr << errmsg << std::endl;
  exit(kFailureStatus);
}
00044
00045 static void usage() {
00046 error("usage: ripper [--stop_after n] <handler options> "
00047 "{- | <repository files>}\n"
00048 "possible handler options are:\n"
00049 "--pvm_slave x declares this ripper as a slave in a group of x tasks\n"
00050 "--pvm_master x declares this ripper as the master in a group of x tasks\n"
00051 "--cat prints the full contents of each document\n"
00052 "--caturl prints the url of each document\n"
00053 "--catlinks prints the url and tolinks of each document\n"
00054 "--no_graphbuilder doesn't build the web link graph\n"
00055 "--start_after x starts numbering documents from x\n"
00056 "--stop_after x stops processing after document number x\n"
00057 "--save_graph prints the web link graph\n"
00058 "--save_index prints an index of the documents read\n"
00059 "--string_memory x reserves x Megabytes for all url strings\n"
00060 "--jumptable_memory x reserves x Megabytes for all trie branch pointers\n"
00061 "--nodetable_memory x reserves x Megabytes for the webnode hashtable\n"
00062 "--leaftable_memory x reserves x Megabytes for the leafnode hashtable\n"
00063 "--name arthurdent names this ripper arthurdent - no quotes please\n"
00064 "--temp_dir z uses z as the temporary directory. Default is /tmp/\n"
00065 "--quit noninteractive mode.\n"
00066 );
00067 }
00068
// Default values for Ripper::rippername and Ripper::tempdir.
//
// These are writable character arrays rather than `char*` pointers to string
// literals: binding a string literal to a non-const `char*` is ill-formed
// since C++11, and a write through such a pointer would be undefined
// behavior. The arrays still decay to `char*`, so the existing assignments to
// the `char*` members of Ripper keep working unchanged.
//
// An empty ripper name means "no name given"; SetupHandlers() refuses to save
// an index in that case.
static char defaultrippername[] = "";
static char defaulttempdir[] = "/tmp/";

// Running document counter. RipRepository() increments it once per processed
// document, and --start_after seeds it with the first number to use.
int num_docs_processed = 0;

// Longest ripper name (in characters, excluding the terminator) accepted by
// --name and by the index-file setup.
#define RIPPER_NAMELEN 20
// Longest temporary-directory string accepted by --temp_dir.
#define RIPPER_TMPDIRLEN 20
00075
00076
00077 class Ripper {
00078 public:
00079 Ripper();
00080 ~Ripper();
00081 void SetupHandlers();
00082 void ParseCmdLineArgs(int argc, char** argv);
00083 void RipRepository (ReposReader* rr);
00084 void PrintStatistics(ostream& o);
00085
00086 WebLinkGraph* PublishWebGraph() { return gb->UndockWebGraph(); }
00087 GraphBuilder* GetGraphBuilder() { return gb; }
00088
00089 vector<string> rep_files_;
00090
00091
00092 struct {
00093 int stop_after;
00094 int string_memory;
00095 int jumptable_memory;
00096 int nodetable_memory;
00097 int leaftable_memory;
00098 int start_ID;
00099 int pvm_numtasks;
00100 bool pvm_is_master;
00101 bool repos_from_stdin;
00102 bool interactive;
00103 bool print_index;
00104 bool handler_cat;
00105 bool handler_caturl;
00106 bool handler_catdate;
00107 bool handler_graph_print;
00108 bool handler_catlinks;
00109 bool no_graphbuilder;
00110 } flags_;
00111
00112 char *rippername;
00113 char *tempdir;
00114 ofstream indexout;
00115
00116 private:
00117
00118 vector<ParseHandler*> parsehandlers_;
00119
00120 GraphBuilder* gb;
00121 };
00122
00123 Ripper::Ripper() {
00124 flags_.stop_after = 0;
00125 flags_.start_ID = 0;
00126
00127 flags_.string_memory = 45;
00128 flags_.jumptable_memory = 35;
00129 flags_.nodetable_memory = 10;
00130 flags_.leaftable_memory = 10;
00131
00132 flags_.interactive = true;
00133 flags_.no_graphbuilder = false;
00134 flags_.repos_from_stdin = false;
00135
00136 flags_.handler_cat = false;
00137 flags_.handler_caturl = false;
00138 flags_.handler_catdate = false;
00139 flags_.handler_graph_print = false;
00140 flags_.handler_catlinks = false;
00141
00142 flags_.print_index = false;
00143
00144 flags_.pvm_is_master = true;
00145 flags_.pvm_numtasks = 0;
00146
00147 rippername = defaultrippername;
00148 tempdir = defaulttempdir;
00149
00150
00151 gb = NULL;
00152 }
00153
00154 Ripper::~Ripper() {
00155 delete gb;
00156
00157 for (vector<ParseHandler*>::iterator ph = parsehandlers_.begin();
00158 ph != parsehandlers_.end(); ++ph)
00159 delete *ph;
00160 }
00161
00162 void Ripper::SetupHandlers () {
00163
00164 if(!flags_.no_graphbuilder) {
00165 gb = new GraphBuilder(flags_.string_memory, flags_.jumptable_memory,
00166 flags_.nodetable_memory, flags_.leaftable_memory,
00167 flags_.handler_catlinks);
00168 }
00169
00170 if (flags_.handler_cat) {
00171 extern ParseHandler* MakeCatHandler();
00172 parsehandlers_.push_back(MakeCatHandler());
00173 }
00174
00175 if (flags_.handler_caturl) {
00176 extern ParseHandler* MakeCatURLHandler();
00177 parsehandlers_.push_back(MakeCatURLHandler());
00178 }
00179
00180 if (flags_.handler_catdate) {
00181 extern ParseHandler* MakeCatDateHandler();
00182 parsehandlers_.push_back(MakeCatDateHandler());
00183 }
00184
00185 if (!flags_.no_graphbuilder) {
00186 extern ParseHandler* MakeGraphHandler(GraphBuilder *graphbuilder);
00187 parsehandlers_.push_back(MakeGraphHandler(gb));
00188 }
00189
00190 if (flags_.print_index) {
00191 if( !*rippername || (strlen(rippername) > RIPPER_NAMELEN) ) {
00192 cerr << "warning: index won't be saved. You must give ripper a short name." << endl;
00193 usage();
00194 } else {
00195
00196 ostringstream nambuf;
00197 nambuf << tempdir << "ripper." << rippername << ".index";
00198 cerr << "info: index saved in file " << nambuf.str() << endl;
00199 indexout.open(nambuf.str().c_str());
00200 if( indexout.fail() ) {
00201 cerr << "error: problem opening index file [" << nambuf.str() << "]" << endl;
00202 }
00203 }
00204 }
00205 if (parsehandlers_.size() == 0) {
00206 usage();
00207 }
00208
00209 }
00210
00211
00212
00213 void Ripper::RipRepository (ReposReader* rr) {
00214 if( flags_.no_graphbuilder ) {
00215
00216 while (!rr->AtEnd() &&
00217 (flags_.stop_after == 0 || num_docs_processed < flags_.stop_after)) {
00218 ParseElt::Process_Document(rr, &parsehandlers_);
00219 num_docs_processed++;
00220 }
00221
00222 } else {
00223
00224 while (!rr->AtEnd() &&
00225 (flags_.stop_after == 0 || num_docs_processed < flags_.stop_after)) {
00226 gb->NodeInitialize(num_docs_processed);
00227 ParseElt::Process_Document(rr, &parsehandlers_);
00228 gb->NodeInsertLinks();
00229 if( flags_.print_index && indexout) {
00230 indexout << gb->NodeGetURL() << " "
00231 << gb->NodeGetID() << " "
00232 << gb->NodeGetDate() << " " << gb->NodeGetAlias()
00233
00234 << " ** " << gb->NodeGetURL_() << " " << ((gb->NodeGetAlias_()) ? gb->NodeGetAlias_() : "")
00235 << endl;
00236 }
00237 gb->NodeLaunch();
00238 num_docs_processed++;
00239 }
00240
00241 }
00242 }
00243
00244
00245
00246 void Ripper::PrintStatistics(ostream& o) {
00247 o << "Number of documents processed: " << num_docs_processed << endl;
00248 if(gb)
00249 gb->StatisticsMem(o);
00250 }
00251
00252
00253 void Ripper::ParseCmdLineArgs(int argc, char** argv) {
00254 argv++;
00255 for (int i = 1; i < argc; ++i, ++argv) {
00256 if ((*argv)[0] == '-' && (*argv)[1] != '\0') {
00257 if (!strcmp(*argv, "--stop_after")) {
00258 ++i;
00259 ++argv;
00260 if (i >= argc) {
00261 usage();
00262 }
00263 flags_.stop_after = atoi(*argv);
00264 } else if (!strcmp(*argv, "--cat")) {
00265 flags_.handler_cat = true;
00266 } else if (!strcmp(*argv, "--caturl")) {
00267 flags_.handler_caturl = true;
00268 }
00269
00270 else if (!strcmp(*argv, "--no_graphbuilder")) {
00271 flags_.no_graphbuilder = true;
00272 } else if (!strcmp(*argv, "--catdate")) {
00273 flags_.handler_catdate = true;
00274 } else if (!strcmp(*argv, "--quit")) {
00275 flags_.interactive = false;
00276 } else if (!strcmp(*argv, "--catlinks")) {
00277 flags_.handler_catlinks = true;
00278 } else if (!strcmp(*argv, "--save_graph")) {
00279 flags_.handler_graph_print = true;
00280 } else if (!strcmp(*argv, "--save_index")) {
00281 flags_.print_index = true;
00282 } else if (!strcmp(*argv, "--string_memory")) {
00283 ++i;
00284 ++argv;
00285 if (i >= argc) {
00286 usage();
00287 }
00288 flags_.string_memory = atoi(*argv);
00289 } else if (!strcmp(*argv, "--jumptable_memory")) {
00290 ++i;
00291 ++argv;
00292 if (i >= argc) {
00293 usage();
00294 }
00295 flags_.jumptable_memory = atoi(*argv);
00296 } else if (!strcmp(*argv, "--nodetable_memory")) {
00297 ++i;
00298 ++argv;
00299 if (i >= argc) {
00300 usage();
00301 }
00302 flags_.nodetable_memory = atoi(*argv);
00303 } else if (!strcmp(*argv, "--leaftable_memory")) {
00304 ++i;
00305 ++argv;
00306 if (i >= argc) {
00307 usage();
00308 }
00309 flags_.leaftable_memory = atoi(*argv);
00310 } else if (!strcmp(*argv, "--name")) {
00311 ++i;
00312 ++argv;
00313 if (i >= argc) {
00314 usage();
00315 }
00316 rippername = *argv;
00317 if( strlen(rippername) > RIPPER_NAMELEN) {
00318 cerr << "Name too long, please use a shorter one." << endl;
00319 usage();
00320 }
00321 } else if (!strcmp(*argv, "--temp_dir")) {
00322 ++i;
00323 ++argv;
00324 if (i >= argc) {
00325 usage();
00326 }
00327 tempdir = *argv;
00328 if( strlen(tempdir) > RIPPER_TMPDIRLEN) {
00329 cerr << "Please use a shorter temporary directory name." << endl;
00330 usage();
00331 }
00332 } else if (!strcmp(*argv, "--pvm_master")) {
00333 ++i;
00334 ++argv;
00335 if (i >= argc) {
00336 usage();
00337 }
00338 flags_.pvm_numtasks = atoi(*argv);
00339 flags_.pvm_is_master = true;
00340 } else if (!strcmp(*argv, "--pvm_slave")) {
00341 ++i;
00342 ++argv;
00343 if (i >= argc) {
00344 usage();
00345 }
00346 flags_.pvm_numtasks = atoi(*argv);
00347 flags_.pvm_is_master = false;
00348 } else if (!strcmp(*argv, "--start_after")) {
00349 ++i;
00350 ++argv;
00351 if (i >= argc) {
00352 usage();
00353 }
00354 flags_.start_ID = atoi(*argv);
00355 num_docs_processed = flags_.start_ID;
00356 } else
00357 usage();
00358 } else {
00359 if ((*argv)[0] == '-' && (*argv)[1] == '\0') {
00360 flags_.repos_from_stdin = true;
00361 } else {
00362 rep_files_.push_back(string(*argv));
00363 }
00364 }
00365 }
00366 if (!flags_.repos_from_stdin && rep_files_.empty()) {
00367 usage();
00368 }
00369 if (flags_.repos_from_stdin && !rep_files_.empty()) {
00370 cerr << "Specify only one source of repository input (files or stdin)"
00371 << std::endl;
00372 usage();
00373 }
00374 }
00375
00376 Ripper *ripper = NULL;
00377
00378
00379 void OutOfMemory() {
00380 cerr << "ran out of memory" << endl;
00381 if( ripper ) {
00382 ripper->PrintStatistics(cerr);
00383 }
00384 abort();
00385 }
00386
00387
00388
00389
00390
00391
00392
00393
00394 int main(int argc, char** argv) {
00395
00396
00397 set_new_handler(OutOfMemory);
00398 ripper = new Ripper();
00399
00400 cerr << "Welcome to the Google Programming Contest ripper." << std::endl
00401 << "Please see the file LICENSE for terms of use of "
00402 << "the data and code." << std::endl;
00403
00404 ripper->ParseCmdLineArgs(argc, argv);
00405 ripper->SetupHandlers();
00406
00407
00408 if (!ripper->flags_.no_graphbuilder &&
00409 ripper->flags_.print_index && ripper->indexout ) {
00410 ripper->indexout << "# Ripper: " << ripper->rippername << endl
00411 << "# url | id | date | alias_url" << endl;
00412 }
00413
00414
00415 if (ripper->flags_.repos_from_stdin) {
00416 ReposReader reprdr(&cin, string("<stdin>"));
00417 ripper->RipRepository(&reprdr);
00418 } else {
00419 for (vector<string>::iterator repname = ripper->rep_files_.begin();
00420 repname != ripper->rep_files_.end(); ++repname) {
00421 std::ifstream repstream((*repname).c_str());
00422 if (! repstream) {
00423 cerr << "error: Cannot open repository file " << *repname
00424 << ", skipping it" << std::endl;
00425 } else {
00426 ReposReader reprdr(&repstream, *repname);
00427 ripper->RipRepository(&reprdr);
00428 }
00429 repstream.close();
00430 }
00431 }
00432
00433
00434 if (!ripper->flags_.no_graphbuilder &&
00435 ripper->flags_.print_index && ripper->indexout ) {
00436 ripper->indexout.close();
00437 }
00438
00439
00440 if(!ripper->flags_.no_graphbuilder && ripper->flags_.interactive) {
00441
00442 cerr << "tempdir:" << ripper->tempdir <<endl;
00443 Talker *talker = new Talker(ripper->rippername, ripper->tempdir,
00444 ripper->PublishWebGraph(),
00445 ripper->GetGraphBuilder(),
00446 ripper->flags_.pvm_is_master,
00447 ripper->flags_.pvm_numtasks);
00448
00449 talker->Talk();
00450 delete talker;
00451
00452 } else {
00453 if( !ripper->flags_.no_graphbuilder &&
00454 ripper->flags_.handler_graph_print ) {
00455 WebLinkGraph *graph = ripper->PublishWebGraph();
00456 graph->PrintWebGraph(cout);
00457 }
00458 ripper->PrintStatistics(cerr);
00459 }
00460
00461 return 0;
00462 }
00463