00001 /** @file webnode.h */ 00002 /* 00003 * Copyright (C) 2002 Laird Breyer 00004 * 00005 * This program is free software; you can redistribute it and/or modify 00006 * it under the terms of the GNU General Public License as published by 00007 * the Free Software Foundation; either version 2 of the License, or 00008 * (at your option) any later version. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 * GNU General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU General Public License 00016 * along with this program; if not, write to the Free Software 00017 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00018 * 00019 * Author: Laird Breyer <laird@lbreyer.com> 00020 */ 00021 #ifndef _WEB_NODE_H_ 00022 #define _WEB_NODE_H_ 00023 #include "mempool.h" 00024 #include "simplehash.h" 00025 #include "leafnode.h" 00026 #include <stdexcept> 00027 #include <set> 00028 00029 /// Encapsulates a web document. 00030 class WebNode; // defined below 00031 /// All WebNodes are allocated on the heap as WebNodePtrs. 00032 typedef WebNode* WebNodePtr; 00033 00034 /// A pointer to a web document/URL. 00035 /** 00036 * During the construction of the web graph, 00037 * document links are of type ptrdiff_t, but after 00038 * the WebLinkGraph is undocked, they are of type WebNodePtr 00039 */ 00040 struct LinkStruct { 00041 union { 00042 ptrdiff_t pointer_diff; 00043 WebNodePtr webnode_ptr; 00044 LeafNodePtr leafnode_ptr; 00045 }; 00046 }; 00047 00048 /// A memory management wrapper around LinkStruct. 00049 typedef MemPoolObject<LinkStruct> Link; 00050 00051 /// A scratch structure used by samplers. 00052 /// Each sampler should cast it into something useful 00053 typedef uint32 ScratchStruct; 00054 00055 #define Mb 1048576L 00056 #define WEBNODE_MEMPOOL_DELTA ((1 * Mb)/sizeof(WebNode)) 00057 #define LINK_MEMPOOL_DELTA ((1 * Mb)/sizeof(Link)) 00058 00059 #define TAG_NUMBER_OF_BITS 16 00060 00061 /// Contains all the information about a web document. 00062 struct WebNodeStruct { 00063 // the ordering here is important due to alignment issues 00064 Link *tolinks; 00065 Link *fromlinks; 00066 uint32 id; 00067 uint32 occupation_count; 00068 uint16 date; 00069 uint16 num_tolinks; 00070 uint16 num_fromlinks; 00071 uint16 num_valid_tolinks; 00072 uint16 num_leaflinks; 00073 uint16 tag; 00074 ScratchStruct scratch; 00075 }; 00076 00077 /// Used by GraphBuilder to store uniquely the anchor links 00078 /// and then insert them into the WebNode. 00079 typedef set<ptrdiff_t> RawLinkSet; //,CharPtrComparisonFunction> RawLinkSet; 00080 00081 /// Encapsulates a web document. 00082 /** 00083 * Every web document read by the ripper is represented by 00084 * a WebNode. The construction of a WebNode is complicated, 00085 * and is done by GraphBuilder, which also links the nodes 00086 * into a WebLinkGraph. 00087 * All the data members are defined as a WebNodeStruct, WebNode 00088 * is really just a wrapper for WebNodeStruct to handle custom 00089 * memory management. 00090 * The class inherits memory management from MemoryPooled<T>. 00091 */ 00092 class WebNode: public MemoryPooled<WebNodeStruct> { 00093 public: 00094 00095 WebNode(uint32 idno); 00096 void InsertRawLinks(RawLinkSet *s); 00097 void NormalizeRawLinks(SimpleHashTable<WebNodePtr> *h); 00098 size_t RealSize(); 00099 // inline functions 00100 int NumberOfValidToLinks() 00101 { return data.num_valid_tolinks;} 00102 int NumberOfDanglingToLinks() 00103 { return (data.num_tolinks - data.num_valid_tolinks - data.num_leaflinks);} 00104 int NumberOfValidFromLinks() 00105 { return data.num_fromlinks;} 00106 int NumberOfLeafLinks() 00107 { return data.num_leaflinks;} 00108 00109 void IncrementNumberOfFromLinks() 00110 { data.num_fromlinks++; } 00111 00112 void AppendFromLink(WebNodePtr anothernode) throw (overflow_error); 00113 00114 void UpdateLeafLinks(SimpleLeafNodePtrHashTable *leaftable); 00115 00116 void SetDate(uint16 adate); 00117 00118 WebNodePtr ValidToLink(int k) 00119 { 00120 assert(k < data.num_valid_tolinks); 00121 assert(data.tolinks[k].data.webnode_ptr); 00122 return data.tolinks[k].data.webnode_ptr; 00123 } 00124 WebNodePtr ValidFromLink(int k) 00125 { 00126 assert(data.fromlinks[k].data.webnode_ptr); 00127 return data.fromlinks[k].data.webnode_ptr; 00128 } 00129 LeafNodePtr ValidLeafLink(int k) 00130 { 00131 assert(k < data.num_leaflinks); 00132 assert(data.tolinks[data.num_valid_tolinks + k].data.leafnode_ptr); 00133 return data.tolinks[data.num_valid_tolinks + k].data.leafnode_ptr; 00134 } 00135 LeafNodePtr ValidLeafLinkDirectly(int k) ///< Same as ValidLeafLink but saves a +/- in -O3 00136 { 00137 assert(k >= data.num_valid_tolinks); 00138 assert(k < data.num_valid_tolinks + data.num_leaflinks); 00139 assert(data.tolinks[k].data.leafnode_ptr); 00140 return data.tolinks[k].data.leafnode_ptr; 00141 } 00142 00143 uint32 ID() 00144 { return data.id; } 00145 uint16 Date() 00146 { return data.date; } 00147 00148 void ClearTag() 00149 { data.tag = 0; } 00150 void SetTag(int k) 00151 { 00152 assert(k < TAG_NUMBER_OF_BITS); 00153 data.tag |= (1<<k); // sets the kth bit 00154 } 00155 bool Tagged(int k) 00156 { 00157 return ((data.tag & (1<<k)) != 0); 00158 } 00159 00160 void ClearOccupationCount() 00161 { 00162 data.occupation_count = 0; 00163 data.scratch = 0; 00164 } 00165 uint32 OccupationCount() 00166 { return data.occupation_count; } 00167 void IncrementOccupationCount() 00168 { data.occupation_count++; } 00169 void IncrementOccupationCount(int c) 00170 { data.occupation_count += c; } 00171 00172 ScratchStruct Scratch() 00173 { return data.scratch; } 00174 void SetScratch(ScratchStruct ascratch) 00175 { data.scratch = ascratch; } 00176 00177 private: 00178 static MemPool<LinkStruct> global_link_pool; 00179 }; 00180 00181 #endif