00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include <stdio.h>
00019 #include <vector>
00020 #include "parseelt.h"
00021 #include "parsehandler-preparse.h"
00022
00023
00024
// Invokes the zero-argument member function `method` on every handler.
// Expects a variable named `ph` in the calling scope that points to a
// vector<ParseHandler*>.  Wrapped in do/while(0) so that `CALL0(x);` is
// exactly one statement and can be used safely in an unbraced if/else.
#define CALL0(method)                                              \
  do {                                                             \
    for (vector<ParseHandler*>::iterator cur_ph = ph->begin();     \
         cur_ph != ph->end(); ++cur_ph) {                          \
      (*cur_ph)->method();                                         \
    }                                                              \
  } while (0)
00029
00030
// Invokes the one-argument member function `method(arg)` on every handler.
// Expects a variable named `ph` in the calling scope that points to a
// vector<ParseHandler*>.  Wrapped in do/while(0) so that `CALL1(x, a);` is
// exactly one statement and can be used safely in an unbraced if/else.
#define CALL1(method, arg)                                         \
  do {                                                             \
    for (vector<ParseHandler*>::iterator cur_ph = ph->begin();     \
         cur_ph != ph->end(); ++cur_ph) {                          \
      (*cur_ph)->method(arg);                                      \
    }                                                              \
  } while (0)
00035
00036
// Invokes the two-argument member function `method(arg1, arg2)` on every
// handler.  Expects a variable named `ph` in the calling scope that points to
// a vector<ParseHandler*>.  Wrapped in do/while(0) so that `CALL2(x, a, b);`
// is exactly one statement and can be used safely in an unbraced if/else.
#define CALL2(method, arg1, arg2)                                  \
  do {                                                             \
    for (vector<ParseHandler*>::iterator cur_ph = ph->begin();     \
         cur_ph != ph->end(); ++cur_ph) {                          \
      (*cur_ph)->method(arg1, arg2);                               \
    }                                                              \
  } while (0)
00041
00042
// Invokes the four-argument member function
// `method(arg1, arg2, arg3, arg4)` on every handler.  Expects a variable named
// `ph` in the calling scope that points to a vector<ParseHandler*>.  Wrapped
// in do/while(0) so that `CALL4(...);` is exactly one statement and can be
// used safely in an unbraced if/else.
#define CALL4(method, arg1, arg2, arg3, arg4)                      \
  do {                                                             \
    for (vector<ParseHandler*>::iterator cur_ph = ph->begin();     \
         cur_ph != ph->end(); ++cur_ph) {                          \
      (*cur_ph)->method(arg1, arg2, arg3, arg4);                   \
    }                                                              \
  } while (0)
00047
// Formats the integer `i` with the printf-style `format` (which must consume
// exactly one int argument) and returns the text as a string.  The output is
// produced in an 80-byte buffer, so at most 79 characters are returned.
string IntToString(int i, const char* format) {
  enum { kScratchSize = 80 };
  char scratch[kScratchSize];
  snprintf(scratch, kScratchSize, format, i);
  string formatted(scratch);
  return formatted;
}
00053
00054
00055
00056
00057 const ParseElt::ParseEltMethod ParseElt::kParseCodeTable[] = {
00058 NULL,
00059 &ParseElt::Process_Header,
00060 &ParseElt::Process_ResponseCode,
00061 &ParseElt::Process_BaseURL,
00062 &ParseElt::Process_Anchor,
00063 &ParseElt::Process_LocalName,
00064 &ParseElt::Process_AnchorDone,
00065 &ParseElt::Process_ChangeFontColor,
00066 &ParseElt::Process_ChangeFontColorEnd,
00067 &ParseElt::Process_ChangeBGColor,
00068 &ParseElt::Process_ChangeBGColorEnd,
00069 &ParseElt::Process_Image,
00070 NULL,
00071 NULL,
00072 &ParseElt::Process_Applet,
00073 &ParseElt::Process_AppletDone,
00074 &ParseElt::Process_IFrame,
00075 &ParseElt::Process_IFrameDone,
00076 &ParseElt::Process_Frame,
00077 &ParseElt::Process_Area,
00078 &ParseElt::Process_Meta,
00079 &ParseElt::Process_Frameset,
00080 &ParseElt::Process_FramesetDone,
00081 &ParseElt::Process_Body,
00082 &ParseElt::Process_BodyDone,
00083 &ParseElt::Process_ParagraphStart,
00084 &ParseElt::Process_ParagraphEnd,
00085 &ParseElt::Process_Break,
00086 &ParseElt::Process_HorizontalRule,
00087 &ParseElt::Process_ListItem,
00088 &ParseElt::Process_UnorderedList,
00089 &ParseElt::Process_OrderedList,
00090 &ParseElt::Process_ListDone,
00091 &ParseElt::Process_Div,
00092 &ParseElt::Process_DivDone,
00093 &ParseElt::Process_Span,
00094 &ParseElt::Process_SpanDone,
00095 &ParseElt::Process_Table,
00096 &ParseElt::Process_TableDone,
00097 &ParseElt::Process_Caption,
00098 &ParseElt::Process_CaptionDone,
00099 &ParseElt::Process_TableHCell,
00100 &ParseElt::Process_TableDCell,
00101 &ParseElt::Process_TableCellDone,
00102 &ParseElt::Process_TableRow,
00103 &ParseElt::Process_TableRowDone,
00104 &ParseElt::Process_Form,
00105 &ParseElt::Process_FormDone,
00106 &ParseElt::Process_Select,
00107 &ParseElt::Process_SelectDone,
00108 &ParseElt::Process_Option,
00109 &ParseElt::Process_OptionDone,
00110 &ParseElt::Process_TextArea,
00111 &ParseElt::Process_TextAreaDone,
00112 &ParseElt::Process_Input,
00113 &ParseElt::Process_Heading,
00114 &ParseElt::Process_HeadingDone,
00115 &ParseElt::Process_Noframes,
00116 &ParseElt::Process_NoframesDone,
00117 &ParseElt::Process_Object,
00118 &ParseElt::Process_ObjectDone,
00119 &ParseElt::Process_Param,
00120 &ParseElt::Process_Embed,
00121 &ParseElt::Process_Head,
00122 &ParseElt::Process_HeadDone,
00123 &ParseElt::Process_SetFace,
00124 &ParseElt::Process_SetSize
00125 };
00126
00127 char ParseElt::cbuf1_[kTextBufSize * 10];
00128 char ParseElt::cbuf2_[kTextBufSize];
00129 int ParseElt::cur_face_ = 0;
00130 int ParseElt::cur_size_ = 0;
00131 Document ParseElt::doc_ = Document();
00132
00133 void ParseElt::Process_Document(ReposReader* rr, HandlerVec ph) {
00134 char urlbuf[1024];
00135 char reurlbuf[1024];
00136 int len;
00137
00138 doc_.Clear();
00139
00140
00141 const char* p = rr->ReadCharStar(&len);
00142 assert((unsigned)len < sizeof(urlbuf));
00143 strncpy(urlbuf, p, len);
00144 urlbuf[len] = '\0';
00145 doc_.set_url(urlbuf);
00146
00147 p = rr->ReadCharStar(&len);
00148 assert((unsigned)len < sizeof(reurlbuf));
00149 strncpy(reurlbuf, p, len);
00150 reurlbuf[len] = '\0';
00151 doc_.set_url_after_redirects(reurlbuf);
00152
00153 struct in_addr ip_addr;
00154 ip_addr.s_addr = rr->ReadFixedUint32();
00155 doc_.set_ip_addr(ip_addr);
00156
00157 doc_.set_content_type(ContentType(rr->ReadInt()));
00158 doc_.set_content_len(rr->ReadVarUint32());
00159 doc_.set_language(Language(rr->ReadInt()));
00160 doc_.set_encoding(Encoding(rr->ReadInt()));
00161
00162 CALL1(NewDocument, &doc_);
00163
00164 unsigned char pc;
00165 if ((pc = rr->ReadByte()) != kBeginDocMarker) {
00166 rr->ParseError("expecting BeginDoc code");
00167 }
00168 while ((pc = rr->ReadByte()) != kEndDocMarker) {
00169 if (IS_TERM_OR_PUNCT(pc)) {
00170 Process_TermOrPunct(pc, rr, ph);
00171 } else {
00172 if (pc > kParseElt_LASTCODE) {
00173 rr->ParseError("found invalid parse code 0x"
00174 + IntToString(static_cast<int>(pc), "%2x"));
00175 }
00176
00177 assert(*kParseCodeTable[pc] != NULL);
00178 (*kParseCodeTable[pc])(pc, rr, ph);
00179 }
00180 }
00181 CALL1(EndDocument, &doc_);
00182 }
00183
00184
00185 void ParseElt::Process_Header(int pc, ReposReader* rr, HandlerVec ph) {
00186 assert(pc == kParseElt_Header);
00187
00188 int keylen;
00189 int valuelen;
00190
00191 const char* p = rr->ReadCharStar(&keylen);
00192 assert((unsigned)keylen <= sizeof(cbuf1_));
00193 strncpy(cbuf1_, p, keylen);
00194
00195 const char* value = rr->ReadCharStar(&valuelen);
00196
00197 CALL4(AddHeader, cbuf1_, keylen, value, valuelen);
00198 }
00199
00200
00201 void ParseElt::Process_ResponseCode(int pc, ReposReader* rr, HandlerVec ph) {
00202 assert(pc == kParseElt_ResponseCode);
00203 int arg1 = rr->ReadInt();
00204 CALL1(AddResponseCode, arg1);
00205 }
00206
00207 void ParseElt::Process_BaseURL(int pc, ReposReader* rr, HandlerVec ph) {
00208 assert(pc == kParseElt_BaseURL);
00209
00210 int len;
00211 const char* p = rr->ReadCharStar(&len);
00212 CALL2(AddBaseURL, p, len);
00213 }
00214
00215 void ParseElt::Process_Anchor(int pc, ReposReader* rr, HandlerVec ph) {
00216 assert(pc == kParseElt_Anchor);
00217
00218 int len;
00219 const char* p = rr->ReadCharStar(&len);
00220 CALL2(AddAnchor, p, len);
00221 }
00222
00223 void ParseElt::Process_LocalName(int pc, ReposReader* rr, HandlerVec ph) {
00224 assert(pc == kParseElt_LocalName);
00225 int len;
00226 const char* p = rr->ReadCharStar(&len);
00227 CALL2(AddLocalName, p, len);
00228 }
00229
00230 void ParseElt::Process_AnchorDone(int pc, ReposReader* rr, HandlerVec ph) {
00231 assert(pc == kParseElt_AnchorDone);
00232 CALL0(AnchorDone);
00233 }
00234
00235 void ParseElt::Process_ChangeFontColor(int pc, ReposReader* rr,
00236 HandlerVec ph) {
00237 assert(pc == kParseElt_ChangeFontColor);
00238 int len;
00239 const char* p = rr->ReadCharStar(&len);
00240 CALL2(ChangeFontColor, p, len);
00241 }
00242
00243 void ParseElt::Process_ChangeFontColorEnd(int pc, ReposReader* rr,
00244 HandlerVec ph) {
00245 assert(pc == kParseElt_ChangeFontColorEnd);
00246 CALL0(ChangeFontColorEnd);
00247 }
00248
00249 void ParseElt::Process_ChangeBGColor(int pc, ReposReader* rr, HandlerVec ph) {
00250 assert(pc == kParseElt_ChangeBGColor);
00251 int len;
00252 const char* p = rr->ReadCharStar(&len);
00253 CALL2(ChangeBGColor, p, len);
00254 }
00255
00256 void ParseElt::Process_ChangeBGColorEnd(int pc, ReposReader* rr,
00257 HandlerVec ph) {
00258 assert(pc == kParseElt_ChangeBGColorEnd);
00259 CALL0(ChangeBGColorEnd);
00260 }
00261
00262 void ParseElt::Process_Image(int pc, ReposReader* rr, HandlerVec ph) {
00263 assert(pc == kParseElt_Image);
00264 int len;
00265 const char* p = rr->ReadCharStar(&len);
00266 CALL2(AddImage, p, len);
00267 }
00268
00269 void ParseElt::Process_Applet(int pc, ReposReader* rr, HandlerVec ph) {
00270 assert(pc == kParseElt_Applet);
00271 int len;
00272 const char* p = rr->ReadCharStar(&len);
00273 CALL2(AddApplet, p, len);
00274 }
00275
00276 void ParseElt::Process_AppletDone(int pc, ReposReader* rr, HandlerVec ph) {
00277 assert(pc == kParseElt_AppletDone);
00278 CALL0(AddAppletDone);
00279 }
00280
00281 void ParseElt::Process_IFrame(int pc, ReposReader* rr, HandlerVec ph) {
00282 assert(pc == kParseElt_IFrame);
00283 int len;
00284 const char* p = rr->ReadCharStar(&len);
00285 CALL2(AddIFrame, p, len);
00286 }
00287
00288 void ParseElt::Process_IFrameDone(int pc, ReposReader* rr, HandlerVec ph) {
00289 assert(pc == kParseElt_IFrameDone);
00290 CALL0(AddIFrameDone);
00291 }
00292
00293 void ParseElt::Process_Frame(int pc, ReposReader* rr, HandlerVec ph) {
00294 assert(pc == kParseElt_Frame);
00295 int len;
00296 const char* p = rr->ReadCharStar(&len);
00297 CALL2(AddFrame, p, len);
00298 }
00299
00300 void ParseElt::Process_Area(int pc, ReposReader* rr, HandlerVec ph) {
00301 assert(pc == kParseElt_Area);
00302 int len;
00303 const char* p = rr->ReadCharStar(&len);
00304 CALL2(AddArea, p, len);
00305 }
00306
00307 void ParseElt::Process_Meta(int pc, ReposReader* rr, HandlerVec ph) {
00308 assert(pc == kParseElt_Meta);
00309 int len;
00310 const char* p = rr->ReadCharStar(&len);
00311 CALL2(AddMeta, p, len);
00312 }
00313
00314 void ParseElt::Process_Frameset(int pc, ReposReader* rr, HandlerVec ph) {
00315 assert(pc == kParseElt_Frameset);
00316 int len;
00317 const char* p = rr->ReadCharStar(&len);
00318 CALL2(AddFrameset, p, len);
00319 }
00320
00321 void ParseElt::Process_FramesetDone(int pc, ReposReader* rr, HandlerVec ph) {
00322 assert(pc == kParseElt_FramesetDone);
00323 CALL0(AddFramesetDone);
00324 }
00325
00326 void ParseElt::Process_Body(int pc, ReposReader* rr, HandlerVec ph) {
00327 assert(pc == kParseElt_Body);
00328 int len;
00329 const char* p = rr->ReadCharStar(&len);
00330 CALL2(AddBody, p, len);
00331 }
00332
00333 void ParseElt::Process_BodyDone(int pc, ReposReader* rr, HandlerVec ph) {
00334 assert(pc == kParseElt_BodyDone);
00335 CALL0(AddBodyDone);
00336 }
00337
00338 void ParseElt::Process_ParagraphStart(int pc, ReposReader* rr, HandlerVec ph) {
00339 assert(pc == kParseElt_ParagraphStart);
00340 int len;
00341 const char* p = rr->ReadCharStar(&len);
00342 CALL2(ParagraphStart, p, len);
00343 }
00344
00345 void ParseElt::Process_ParagraphEnd(int pc, ReposReader* rr, HandlerVec ph) {
00346 assert(pc == kParseElt_ParagraphEnd);
00347 CALL0(ParagraphEnd);
00348 }
00349
00350 void ParseElt::Process_Break(int pc, ReposReader* rr, HandlerVec ph) {
00351 assert(pc == kParseElt_Break);
00352 CALL0(AddBreak);
00353 }
00354
00355 void ParseElt::Process_HorizontalRule(int pc, ReposReader* rr, HandlerVec ph) {
00356 assert(pc == kParseElt_HorizontalRule);
00357 CALL0(AddHorizontalRule);
00358 }
00359
00360 void ParseElt::Process_ListItem(int pc, ReposReader* rr, HandlerVec ph) {
00361 assert(pc == kParseElt_ListItem);
00362 CALL0(AddListItem);
00363 }
00364
00365 void ParseElt::Process_UnorderedList(int pc, ReposReader* rr, HandlerVec ph) {
00366 assert(pc == kParseElt_UnorderedList);
00367 CALL0(AddUnorderedList);
00368 }
00369
00370 void ParseElt::Process_OrderedList(int pc, ReposReader* rr, HandlerVec ph) {
00371 assert(pc == kParseElt_OrderedList);
00372 CALL0(AddOrderedList);
00373 }
00374
00375 void ParseElt::Process_ListDone(int pc, ReposReader* rr, HandlerVec ph) {
00376 assert(pc == kParseElt_ListDone);
00377 CALL0(AddListDone);
00378 }
00379
00380 void ParseElt::Process_Div(int pc, ReposReader* rr, HandlerVec ph) {
00381 assert(pc == kParseElt_Div);
00382 int len;
00383 const char* p = rr->ReadCharStar(&len);
00384 CALL2(AddDiv, p, len);
00385 }
00386
00387 void ParseElt::Process_DivDone(int pc, ReposReader* rr, HandlerVec ph) {
00388 assert(pc == kParseElt_DivDone);
00389 int len;
00390 const char* p = rr->ReadCharStar(&len);
00391 CALL2(AddDivDone, p, len);
00392 }
00393
00394 void ParseElt::Process_Span(int pc, ReposReader* rr, HandlerVec ph) {
00395 assert(pc == kParseElt_Span);
00396 int len;
00397 const char* p = rr->ReadCharStar(&len);
00398 CALL2(AddSpan, p, len);
00399 }
00400
00401 void ParseElt::Process_SpanDone(int pc, ReposReader* rr, HandlerVec ph) {
00402 assert(pc == kParseElt_SpanDone);
00403 int len;
00404 const char* p = rr->ReadCharStar(&len);
00405 CALL2(AddSpanDone, p, len);
00406 }
00407
00408 void ParseElt::Process_Table(int pc, ReposReader* rr, HandlerVec ph) {
00409 assert(pc == kParseElt_Table);
00410 CALL0(AddTable);
00411 }
00412
00413 void ParseElt::Process_TableDone(int pc, ReposReader* rr, HandlerVec ph) {
00414 assert(pc == kParseElt_TableDone);
00415 CALL0(AddTableDone);
00416 }
00417
00418 void ParseElt::Process_Caption(int pc, ReposReader* rr, HandlerVec ph) {
00419 assert(pc == kParseElt_Caption);
00420 CALL0(AddCaption);
00421 }
00422
00423 void ParseElt::Process_CaptionDone(int pc, ReposReader* rr, HandlerVec ph) {
00424 assert(pc == kParseElt_CaptionDone);
00425 CALL0(AddCaptionDone);
00426 }
00427
00428 void ParseElt::Process_TableHCell(int pc, ReposReader* rr, HandlerVec ph) {
00429 assert(pc == kParseElt_TableHCell);
00430 int len;
00431 const char* p = rr->ReadCharStar(&len);
00432 CALL2(AddTableHCell, p, len);
00433 }
00434
00435 void ParseElt::Process_TableDCell(int pc, ReposReader* rr, HandlerVec ph) {
00436 assert(pc == kParseElt_TableDCell);
00437 int len;
00438 const char* p = rr->ReadCharStar(&len);
00439 CALL2(AddTableDCell, p, len);
00440 }
00441
00442 void ParseElt::Process_TableCellDone(int pc, ReposReader* rr, HandlerVec ph) {
00443 assert(pc == kParseElt_TableCellDone);
00444 CALL0(AddTableCellDone);
00445 }
00446
00447 void ParseElt::Process_TableRow(int pc, ReposReader* rr, HandlerVec ph) {
00448 assert(pc == kParseElt_TableRow);
00449 CALL0(AddTableRow);
00450 }
00451
00452 void ParseElt::Process_TableRowDone(int pc, ReposReader* rr, HandlerVec ph) {
00453 assert(pc == kParseElt_TableRowDone);
00454 CALL0(AddTableRowDone);
00455 }
00456
00457 void ParseElt::Process_Form(int pc, ReposReader* rr, HandlerVec ph) {
00458 assert(pc == kParseElt_Form);
00459 int len;
00460 const char* p = rr->ReadCharStar(&len);
00461 CALL2(AddForm, p, len);
00462 }
00463
00464 void ParseElt::Process_FormDone(int pc, ReposReader* rr, HandlerVec ph) {
00465 assert(pc == kParseElt_FormDone);
00466 CALL0(AddFormDone);
00467 }
00468
00469 void ParseElt::Process_Select(int pc, ReposReader* rr, HandlerVec ph) {
00470 assert(pc == kParseElt_Select);
00471 int len;
00472 const char* p = rr->ReadCharStar(&len);
00473 CALL2(AddSelect, p, len);
00474 }
00475
00476 void ParseElt::Process_SelectDone(int pc, ReposReader* rr, HandlerVec ph) {
00477 assert(pc == kParseElt_SelectDone);
00478 CALL0(AddSelectDone);
00479 }
00480
00481 void ParseElt::Process_Option(int pc, ReposReader* rr, HandlerVec ph) {
00482 assert(pc == kParseElt_Option);
00483 int len;
00484 const char* p = rr->ReadCharStar(&len);
00485 CALL2(AddOption, p, len);
00486 }
00487
00488 void ParseElt::Process_OptionDone(int pc, ReposReader* rr, HandlerVec ph) {
00489 assert(pc == kParseElt_OptionDone);
00490 CALL0(AddOptionDone);
00491 }
00492
00493 void ParseElt::Process_TextArea(int pc, ReposReader* rr, HandlerVec ph) {
00494 assert(pc == kParseElt_TextArea);
00495 int len;
00496 const char* p = rr->ReadCharStar(&len);
00497 CALL2(AddTextArea, p, len);
00498 }
00499
00500 void ParseElt::Process_TextAreaDone(int pc, ReposReader* rr, HandlerVec ph) {
00501 assert(pc == kParseElt_TextAreaDone);
00502 CALL0(AddTextAreaDone);
00503 }
00504
00505 void ParseElt::Process_Input(int pc, ReposReader* rr, HandlerVec ph) {
00506 assert(pc == kParseElt_Input);
00507 int len;
00508 const char* p = rr->ReadCharStar(&len);
00509 CALL2(AddInput, p, len);
00510 }
00511
00512 void ParseElt::Process_Heading(int pc, ReposReader* rr, HandlerVec ph) {
00513 assert(pc == kParseElt_Heading);
00514 int arg1 = rr->ReadInt();
00515 CALL1(AddHeading, arg1);
00516 }
00517
00518 void ParseElt::Process_HeadingDone(int pc, ReposReader* rr, HandlerVec ph) {
00519 assert(pc == kParseElt_HeadingDone);
00520 CALL0(AddHeadingDone);
00521 }
00522
00523 void ParseElt::Process_Noframes(int pc, ReposReader* rr, HandlerVec ph) {
00524 assert(pc == kParseElt_Noframes);
00525 CALL0(AddNoframes);
00526 }
00527
00528 void ParseElt::Process_NoframesDone(int pc, ReposReader* rr, HandlerVec ph) {
00529 assert(pc == kParseElt_NoframesDone);
00530 CALL0(AddNoframesDone);
00531 }
00532
00533 void ParseElt::Process_Object(int pc, ReposReader* rr, HandlerVec ph) {
00534 assert(pc == kParseElt_Object);
00535 int len;
00536 const char* p = rr->ReadCharStar(&len);
00537 CALL2(AddObject, p, len);
00538 }
00539
00540 void ParseElt::Process_ObjectDone(int pc, ReposReader* rr, HandlerVec ph) {
00541 assert(pc == kParseElt_ObjectDone);
00542 CALL0(AddObjectDone);
00543 }
00544
00545 void ParseElt::Process_Param(int pc, ReposReader* rr, HandlerVec ph) {
00546 assert(pc == kParseElt_Param);
00547 int len;
00548 const char* p = rr->ReadCharStar(&len);
00549 CALL2(AddParam, p, len);
00550 }
00551
00552 void ParseElt::Process_Embed(int pc, ReposReader* rr, HandlerVec ph) {
00553 assert(pc == kParseElt_Embed);
00554 int len;
00555 const char* p = rr->ReadCharStar(&len);
00556 CALL2(AddEmbed, p, len);
00557 }
00558
00559 void ParseElt::Process_Head(int pc, ReposReader* rr, HandlerVec ph) {
00560 assert(pc == kParseElt_Head);
00561 int len;
00562 const char* p = rr->ReadCharStar(&len);
00563 CALL2(AddHead, p, len);
00564 }
00565
00566 void ParseElt::Process_HeadDone(int pc, ReposReader* rr, HandlerVec ph) {
00567 assert(pc == kParseElt_HeadDone);
00568 CALL0(AddHeadDone);
00569 }
00570
00571 void ParseElt::Process_SetFace(int pc, ReposReader* rr, HandlerVec ph) {
00572 assert(pc == kParseElt_SetFace);
00573 cur_face_ = rr->ReadInt();
00574 }
00575
00576 void ParseElt::Process_SetSize(int pc, ReposReader* rr, HandlerVec ph) {
00577 assert(pc == kParseElt_SetSize);
00578 cur_size_ = rr->ReadInt();
00579 }
00580
00581 void ParseElt::Process_TermOrPunct(int pc, ReposReader* rr, HandlerVec ph) {
00582 int len;
00583 const char* pt;
00584
00585 if (GET_LENGTH_FOLLOWS(pc)) {
00586 pt = rr->ReadCharStar(&len);
00587 } else {
00588 len = GET_LENGTH(pc);
00589 pt = rr->ReadCharsOnly(len);
00590 }
00591 if (IS_TERM(pc)) {
00592 CALL4(AddTerm, pt, len, cur_face_, cur_size_);
00593 } else {
00594 CALL4(AddPunctuation, pt, len, cur_face_, cur_size_);
00595 }
00596 if (GET_WHITESPACE_FOLLOWS(pc)) {
00597 CALL0(WhitespaceEndedTerm);
00598 }
00599 }