00001 /** @file sampler.cc */ 00002 /* 00003 * Copyright (C) 2002 Laird Breyer 00004 * 00005 * This program is free software; you can redistribute it and/or modify 00006 * it under the terms of the GNU General Public License as published by 00007 * the Free Software Foundation; either version 2 of the License, or 00008 * (at your option) any later version. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 * GNU General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU General Public License 00016 * along with this program; if not, write to the Free Software 00017 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00018 * 00019 * Author: Laird Breyer <laird@lbreyer.com> 00020 */ 00021 00022 #ifndef _SAMPLER_H_ 00023 #define _SAMPLER_H_ 00024 //#include "document.h" 00025 #include "webnode.h" 00026 #include "linkgraph.h" 00027 #include <gsl/gsl_rng.h> 00028 00029 00030 /// Abstract base class for all samplers. 00031 /** 00032 * A sampler 00033 * populates the web link graph with occupation counts 00034 * which are interpreted as page ranking schemes. 00035 * Supported samplers must have transition probabilities 00036 * of the form P(x,y) = eps * mu(y) + (1-eps) * Q(x,y), 00037 * where the derived class overrides the Q(x,y) definition 00038 * which is contained in the QEvolveFrom() virtual function. 00039 */ 00040 class WebSampler { 00041 public: 00042 00043 WebSampler(WebLinkGraph* g) throw (exception); 00044 00045 00046 void SimulateForward(long n); 00047 void ClearAllocForward(); 00048 void SimulateAllocForward(); 00049 void IncrementAllocForward(uint32 k, int32 num); 00050 00051 void TaggedSimulateForward(long n, const uint32 *fromsetsize); 00052 /// performs a random transition, should be overridden by derived classes 00053 virtual WebNodePtr QEvolveFrom(WebNodePtr x) 00054 { return x; } 00055 /// returns an identifying string in supplied buffer 00056 virtual char *Name(char *buf) = 0; 00057 /// Clears occupation counts and run sizes 00058 void ClearCounts(); 00059 void ClearScratch(); 00060 void SetRandomSeed(unsigned long int s) { gsl_rng_set(r,s); } 00061 void PrintCounts(ostream& o); 00062 void PrintTagCounts(ostream& o); 00063 /// Number of samples produced by SimulateForward() 00064 uint32 LastRunSize() 00065 { return last_run_size; } 00066 /// Number of candidates accepted by TaggedSimulateForward() 00067 uint32 LastTaggedRunSize() 00068 { return last_tagged_run_size; } 00069 00070 protected: 00071 WebLinkGraph* graph; 00072 uint32 number_of_nodes; 00073 00074 gsl_rng* r; // for random numbers 00075 double eps; 00076 00077 int32 *allocation_list; // for SimulateForward() 00078 LeafNodePtr xleaf; 00079 00080 double probabilities[TAG_NUMBER_OF_BITS]; // for TaggedSimulateForward() 00081 long allocated[TAG_NUMBER_OF_BITS]; 00082 uint32 fromsetsize_remaining[TAG_NUMBER_OF_BITS]; 00083 00084 uint32 last_run_size; 00085 uint32 last_tagged_run_size; 00086 }; 00087 00088 00089 /// A WebSampler which implements the standard PageRank 00090 /// chain. 00091 class PageRankSampler: public WebSampler { 00092 public: 00093 PageRankSampler(WebLinkGraph* agraph); 00094 void SetParameters(double epsilon); 00095 WebNodePtr QEvolveFrom(WebNodePtr x); 00096 virtual char *Name(char *buf); 00097 00098 }; 00099 00100 /// A scratch structure used by DateBiasedPageRankSampler 00101 typedef union DBScratchStruct { 00102 00103 DBScratchStruct(ScratchStruct s) 00104 { ss = s; } 00105 ~DBScratchStruct() {} 00106 00107 struct { 00108 uint16 filled; 00109 int16 minvalue; 00110 } db; 00111 00112 float mass; 00113 00114 ScratchStruct ss; 00115 }; 00116 00117 /// A WebSampler which implements page ranking by taking 00118 /// into account the relative age of linked-to documents. 00119 class DateBiasedPageRankSampler: public WebSampler { 00120 public: 00121 DateBiasedPageRankSampler(WebLinkGraph* agraph); 00122 void SetParameters(double epsilon, double lambda, uint16 daterange); 00123 WebNodePtr QEvolveFrom(WebNodePtr x); 00124 virtual char *Name(char *buf); 00125 private: 00126 double lam; 00127 float lamhat; 00128 uint16 daterange; 00129 }; 00130 00131 typedef enum { undef, hubs, auth } ktype; 00132 00133 /// A WebSampler which implements an epsilon approximation 00134 /// of the Kleinberg hubs and authorities model. 00135 /** 00136 * The Kleinberg hubs and authorities model can be obtained by 00137 * running a Markov chain with transition probs QQ^*, where Q 00138 * follows tolinks at random, and Q^* follows fromlinks at random. 00139 * This chain is not UE, so doesn't fit into our framework. But 00140 * by choosing epsilon close to zero, we can get an approximation. 00141 * 00142 */ 00143 class TruncatedKleinbergSampler: public WebSampler { 00144 public: 00145 TruncatedKleinbergSampler(WebLinkGraph* agraph); 00146 void SetParameters(double epsilon, ktype what); 00147 WebNodePtr QEvolveFrom(WebNodePtr x); 00148 virtual char *Name(char *buf); 00149 private: 00150 ktype which; 00151 }; 00152 00153 #endif