00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "handler-parser.h"
00023 #include "graphbuilder.h"
00024 #include "content-type.h"
00025
00026
00027 extern int strincmp(const char *s1, const char *s2, int s2len);
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 class GraphParseHandler : public ParseHandler {
00039 public:
00040
00041 GraphParseHandler(GraphBuilder *graphbuilder);
00042
00043 virtual void NewDocument(const Document* doc);
00044 virtual void AddHeader(const char* key, int keylen,
00045 const char* value, int valuelen);
00046 virtual void AddAnchor(const char* href, int hreflen);
00047
00048 private:
00049 GraphBuilder * gb;
00050 };
00051
00052 ParseHandler* MakeGraphHandler(GraphBuilder *graphbuilder) {
00053 return new GraphParseHandler(graphbuilder);
00054 }
00055
00056 GraphParseHandler :: GraphParseHandler(GraphBuilder *graphbuilder) {
00057 assert( graphbuilder );
00058 gb = graphbuilder;
00059 }
00060
00061 void GraphParseHandler :: NewDocument(const Document* doc) {
00062
00063
00064
00065 if( strlen(doc->url_after_redirects()) > 0 ) {
00066 gb->NodeSetURL(doc->url_after_redirects(), doc->url());
00067
00068 } else {
00069 gb->NodeSetURL(doc->url(), NULL);
00070 }
00071 }
00072
00073
00074 void GraphParseHandler :: AddHeader(const char* key, int keylen,
00075 const char* value, int valuelen) {
00076
00077
00078 if( (strncmp(key,"Date",4) == 0) ||
00079 (strncmp(key,"Last-Modified",13) == 0) ||
00080 (strncmp(key,"X-Google-Crawl-Date",19) == 0) ) {
00081
00082
00083
00084
00085 assert( valuelen < 50 );
00086 char mydate[50];
00087 char * p;
00088 int d=0, m=0, y=0;
00089 bool badformat = false;
00090 int64 jd;
00091 char * daytable = "janfebmaraprmayjunjulaugsepoctnovdec";
00092
00093
00094 strncpy(mydate,value,valuelen);
00095 mydate[valuelen] = 0;
00096
00097
00098 for( p = mydate; *p; p++) {
00099 if( isdigit(p[0]) && isdigit(p[1]) && (p[2] == ':') &&
00100 isdigit(p[3]) && isdigit(p[4]) && (p[5] == ':') &&
00101 isdigit(p[6]) && isdigit(p[7]) && (!isdigit(p[8])) ) {
00102 memset(p, ' ', 8);
00103 break;
00104 }
00105 }
00106
00107
00108 if( !strtok(mydate, " -") ) {
00109 badformat = true;
00110 }
00111
00112 for(int daymon=0; daymon < 2; daymon++) {
00113 p = strtok(NULL, " -");
00114 if( p && isdigit(p[0]) ) {
00115 d = atoi(p);
00116 if( (d > 31) || (d < 1) ) {
00117 badformat = true;
00118 }
00119 } else if( p && isalnum(p[0]) ) {
00120 m = 0;
00121 for(int k = 0; k < 12; k++) {
00122 int q = 3*k;
00123 if( (daytable[q] == tolower(p[0])) &&
00124 (daytable[q+1] == tolower(p[1])) &&
00125 (daytable[q+2] == tolower(p[2])) ) {
00126 m = k+1;
00127 break;
00128 }
00129 }
00130 if( m == 0 ) {
00131 badformat = true;
00132 }
00133 } else {
00134 badformat = true;
00135 }
00136 }
00137
00138 p = strtok(NULL, " -");
00139 if( p ) {
00140 y = atoi(p);
00141 if( y < 100 ) {
00142 y = (y < 50) ? (y + 2000) : (y + 1900);
00143 } else if( y < 1994) {
00144 if( y == 100 ) {
00145 y = 2000;
00146 } else if( y == 101 ) {
00147 y = 2001;
00148 } else {
00149 badformat = true;
00150 }
00151 } else if( y > 2010) {
00152 badformat = true;
00153 }
00154 } else {
00155 badformat = true;
00156 }
00157
00158 if( !badformat ) {
00159
00160 jd = ( 1461 * ( y + 4800 + ( m - 14 ) / 12 ) ) / 4 +
00161 ( 367 * ( m - 2 - 12 * ( ( m - 14 ) / 12 ) ) ) / 12 -
00162 ( 3 * ( ( y + 4900 + ( m - 14 ) / 12 ) / 100 ) ) / 4 +
00163 d + 2367925;
00164 jd -= 4845000;
00165
00166 } else {
00167 jd = 0;
00168 strncpy(mydate,value,valuelen);
00169 cerr << "warning: cannot parse date, setting to zero [" << mydate << "]" << endl;
00170 }
00171
00172
00173 assert( jd <= kuint16max );
00174 gb->NodeSetDate((uint16)jd);
00175 }
00176 }
00177
00178 void GraphParseHandler :: AddAnchor(const char* href, int hreflen) {
00179
00180
00181
00182
00183
00184 if( hreflen <= 0 ) {
00185 return;
00186 } else if( href[0] == '#' ) {
00187 return;
00188 } else if( (hreflen > 6) && (strncasecmp(href, "ftp://", 6) == 0) ) {
00189
00190 return;
00191 } else if( (hreflen > 9) && (strncasecmp(href, "gopher://", 9) == 0) ) {
00192
00193 return;
00194 } else if( (hreflen > 7) && (strncasecmp(href, "mailto:", 7) == 0) ) {
00195
00196 return;
00197 } else if( (hreflen > 5) && (strncasecmp(href, "nntp:", 5) == 0) ) {
00198
00199 return;
00200 } else if( (hreflen > 5) && (strncasecmp(href, "news:", 5) == 0) ) {
00201
00202 return;
00203 } else if( (hreflen > 7) && (strncasecmp(href, "telnet:", 7) == 0) ) {
00204
00205 return;
00206 } else if( (hreflen > 7) && (strncasecmp(href, "tn3270:", 7) == 0) ) {
00207
00208 return;
00209 } else if( (hreflen > 5) && (strncasecmp(href, "wais:", 5) == 0) ) {
00210
00211 return;
00212 } else if( (hreflen > 5) && (strncasecmp(href, "file:", 5) == 0) ) {
00213
00214
00215 char *p = strstr(href,"htm");
00216 if( p && ((p - href) < hreflen) ) {
00217
00218 } else {
00219 return;
00220 }
00221 } else if( (hreflen > 11) && (strncasecmp(href, "javascript:", 11) == 0) ) {
00222
00223 return;
00224 } else if( (hreflen > 9) && (strncasecmp(href, "prospero:", 11) == 0) ) {
00225
00226 return;
00227 }
00228
00229 ContentType ctype;
00230 const char *url = gb->FormatURL(href, hreflen, &ctype);
00231
00232
00233
00234 switch (ctype) {
00235 case CONTENT_APPLICATION_POSTSCRIPT:
00236 case CONTENT_APPLICATION_PDF:
00237 case CONTENT_APPLICATION_MSWORD:
00238 case CONTENT_TEXT_RTF:
00239 case CONTENT_APPLICATION_MS_POWERPOINT:
00240 case CONTENT_APPLICATION_XGZIP:
00241 case CONTENT_IMAGE:
00242 case CONTENT_TEXT_PLAIN:
00243 case CONTENT_AUDIO_MP3:
00244 return;
00245 break;
00246 case CONTENT_GOOGLE_OTHER:
00247 case CONTENT_TEXT_HTML:
00248 default:
00249 break;
00250 }
00251
00252
00253 if( strcmp(url, gb->NodeGetURL()) != 0 ) {
00254 gb->TrieInsertLinkURL(url);
00255 if(gb->flags.show_links) {
00256 cout << " * " << url << endl;
00257 }
00258 }
00259 }
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270