Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

sampler.cc

Go to the documentation of this file.
00001 /** @file sampler.cc */
00002 /* 
00003  * Copyright (C) 2002 Laird Breyer
00004  *  
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  * 
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018  * 
00019  * Author:   Laird Breyer <laird@lbreyer.com>
00020  */
00021 
00022 #include "sampler.h"
00023 //#include "basictypes.h"
00024 #include <math.h>
00025 //#include <gsl/gsl_rng.h>
00026 #include <gsl/gsl_randist.h>
00027 
00028 /// among other things, initializes the random number generator
00029 WebSampler :: WebSampler(WebLinkGraph* g) throw (exception) {
00030   assert(g);
00031 
00032   graph = g;
00033   number_of_nodes = graph->size();
00034   allocation_list = new int32[number_of_nodes];
00035   memset(allocation_list, 0, number_of_nodes * sizeof(int32));
00036 
00037   ClearCounts();
00038   ClearScratch();
00039 
00040   r = gsl_rng_alloc(gsl_rng_taus);
00041   if( !r ) {
00042     cerr << "error: couldn't initialize the random number generator" << endl;
00043     throw exception();
00044   }
00045 
00046   eps = 1.0;
00047   xleaf = NULL;
00048 }
00049 
00050 /// clears occupation counts for all WebNode objects in the graph
00051 void WebSampler :: ClearCounts() {
00052    for(WebNodeList::iterator i = graph->begin();
00053       i != graph->end(); i++ ) {
00054      (*i)->ClearOccupationCount();
00055    }
00056    last_run_size = 0;
00057    last_tagged_run_size = 0;
00058 }
00059 
00060 /// clears scratch area for all WebNode objects in the graph
00061 void WebSampler :: ClearScratch() {
00062    for(WebNodeList::iterator i = graph->begin();
00063        i != graph->end(); i++ ) {
00064      (*i)->SetScratch(0);
00065    }
00066 }
00067 
00068 /// iterates through the WebNodeList and prints
00069 /// the occupation counts for each WebNode. 
00070 /**
00071  * Also
00072  * calculates a 95% confidence interval for each count. 
00073  * That's why it's a WebSampler member function, and not
00074  * a WebLinkGraph member function.
00075  */
00076 void WebSampler :: PrintCounts(ostream& o) {
00077   char buf[1024];
00078   o << "# Sampler used: " << Name(buf) << endl;
00079   o << "# Number of samples produced: " << last_run_size << endl;
00080   o << "# node | occupation_count | 95% conf. interval" << endl;
00081   for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00082     double k = (double)(*i)->OccupationCount();
00083     double eps = 1.96 * sqrt(k * (last_run_size - k)/last_run_size);
00084     o << (*i)->ID() << " " 
00085       << (*i)->OccupationCount() << " "
00086       << max(static_cast<int>(floor(k - eps)),0) << " " 
00087       << static_cast<unsigned int>(ceil(k + eps)) << endl;
00088   }
00089 }
00090 
00091 /// same as WebSampler::PrintCounts(), but only prints 
00092 /// tagged documents.
00093 void WebSampler :: PrintTagCounts(ostream& o) {
00094   char buf[1024];
00095   o << "# Sampler used: " << Name(buf) << endl;
00096   o << "# Number of (tagged) samples produced: " << last_tagged_run_size << endl;
00097   o << "# node | occupation_count | 95% conf. interval" << endl;
00098   for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++) {
00099     if( (*i)->Tagged(0) ) {
00100       double k = (double)(*i)->OccupationCount();
00101       double eps = 1.96 * sqrt(k * (last_tagged_run_size - k)/last_tagged_run_size);
00102       o << (*i)->ID() << " " 
00103         << (*i)->OccupationCount() << " "
00104         << max(static_cast<int>(floor(k - eps)),0) << " " 
00105         << static_cast<unsigned int>(ceil(k + eps)) << endl;
00106     }
00107   }
00108 }
00109 
00110 /// Clears the list of starting places.
00111 void WebSampler :: ClearAllocForward() {
00112   assert(number_of_nodes > 0);
00113   memset(allocation_list, 0, number_of_nodes * sizeof(int32));
00114 }
00115 
00116 /// Adds num to the starting allocation for id
00117 void WebSampler :: IncrementAllocForward(uint32 k, int32 num) {
00118   assert( static_cast<unsigned int>(k) < number_of_nodes);
00119   allocation_list[k] += num;
00120 }
00121 
00122 void WebSampler :: SimulateAllocForward() {
00123   // now for each starting place, simulate k steps
00124   // forward, where k ~ Geom(epsilon)
00125   uint32 j = 0;
00126   for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++, j++ ) {
00127     for(int k = allocation_list[j]; k > 0; k--) {
00128 
00129       WebNodePtr x = (*i);
00130       assert(x);
00131       while( (gsl_rng_uniform(r) > eps) &&
00132              x && (x->NumberOfValidToLinks() > 0)) {
00133         x = QEvolveFrom(x);
00134       }
00135 
00136       if( x ) {
00137         x->IncrementOccupationCount();
00138       } else if( xleaf ) {
00139         xleaf->IncrementOccupationCount();
00140       }
00141 
00142     }
00143   }
00144 }
00145 
00146 /// samples n random variables from the page ranking distribution
00147 /**
00148  * Each of the sampled variables is
00149  * generated by a uniform on the web link graph, followed
00150  * by a geometric number (mean eps) of transitions using
00151  * the virtual function QEvolveFrom(). The final WebNodePtr value is 
00152  * recorded in the WebNode's occupation_number data member.
00153  */
00154 void WebSampler :: SimulateForward(long n) {
00155   assert(graph);
00156   assert(allocation_list);
00157   assert( n > 0 );
00158 
00159   assert(number_of_nodes > 0);
00160 
00161   ClearAllocForward();
00162 
00163   // generate n starting places
00164   for(long i = 0; i < n; i++)
00165     allocation_list[(int32)gsl_rng_uniform_int(r, number_of_nodes)]++;
00166 
00167   SimulateAllocForward();
00168 
00169   last_run_size += n;
00170 }
00171 
00172 /// samples from part of the page ranking distribution by rejection 
00173 /// sampling. 
00174 /** 
00175  * A total of n candidates are generated, not all
00176  * of which are eventually accepted. 
00177  * Only candidates which are accepted show up in the relevant
00178  * WebNode's occupation_count. 
00179  * Before calling this, 
00180  * make sure to call Grapher::BuildFromSets(), whithout which
00181  * this algorithm will bomb. 
00182  */
00183 void WebSampler :: TaggedSimulateForward(long n, const uint32 *fromsetsize) {
00184   assert(graph);
00185   assert(allocation_list);
00186   assert( n > 0 );
00187 
00188   assert(number_of_nodes > 0);
00189 
00190   // figure out how many must start in a FromSet, and 
00191   // how many start anywhere in the graph
00192   long num_less_than_K = 0;
00193   long num_greater_than_K = 0;
00194   double threshold = pow(1 - eps, TAG_NUMBER_OF_BITS);
00195   for(long i = 0; i < n; i++) {
00196     if( gsl_rng_uniform(r) > threshold ) {
00197       num_less_than_K++;
00198     } else {
00199       num_greater_than_K++;
00200     }
00201   }
00202   // deal with chains whose number of transitions exceeds K
00203   // this code is nealy identical to SimulateForward
00204   {
00205     cerr << "info: tagged simulation uses " << num_greater_than_K << " global candidates (" 
00206          << (100 * num_greater_than_K)/n << "%)"<< endl;
00207     long num_accepted = 0;
00208     // generate n starting places
00209     memset(allocation_list, 0, number_of_nodes * sizeof(int32));
00210     for(long i = 0; i < num_greater_than_K; i++)
00211       allocation_list[(int32)gsl_rng_uniform_int(r, number_of_nodes)]++;
00212 
00213     // now for each starting place, simulate K+s steps
00214     // forward, where s ~ Geom(epsilon)
00215     uint32 j = 0;
00216     for(WebNodeList::iterator i = graph->begin();
00217         i != graph->end(); i++, j++ ) {
00218       for(int k = allocation_list[j]; k > 0; k--) {
00219         
00220         WebNodePtr x = (*i);
00221         assert(x);
00222         // step through K mandatory transitions
00223         for(int t = 0; (t < TAG_NUMBER_OF_BITS) && (x->NumberOfValidToLinks() > 0); t++) {
00224           x = QEvolveFrom(x);
00225         }
00226         // now generate the others
00227         while( (gsl_rng_uniform(r) > eps) && (x->NumberOfValidToLinks() > 0)) {
00228           x = QEvolveFrom(x);
00229         }
00230         // accept only if the result is tagged
00231         if( x->Tagged(0) ) {
00232           x->IncrementOccupationCount();
00233           last_tagged_run_size++;
00234           num_accepted++;
00235         }
00236       }
00237     }
00238     cerr << "info: tagged simulation has produced " << num_accepted << " global samples" << endl;
00239   }
00240 
00241   // deal with chains whose number of transitions is less than K
00242   {
00243     cerr << "info: tagged simulation uses " << num_less_than_K << " local candidates" << endl;
00244     long num_accepted = 0;
00245     /// generate num_less_than_K starting points
00246 
00247     for(int k = 0; k < TAG_NUMBER_OF_BITS; k++) {
00248       probabilities[k] = static_cast<double>(fromsetsize[k]) * eps * pow(1 - eps, k);
00249       allocated[k] = 0;
00250       fromsetsize_remaining[k] = fromsetsize[k]; // will be decremented below
00251     }
00252     gsl_ran_discrete_t *grd = gsl_ran_discrete_preproc(TAG_NUMBER_OF_BITS, probabilities);
00253     for(long l = 0; l < num_less_than_K; l++ ) {
00254       // allocated[k] contains the number of starting poinst
00255       // allocated to the kth FromSet
00256       allocated[gsl_ran_discrete(r,grd)]++;
00257     }
00258     gsl_ran_discrete_free(grd);
00259 
00260     // now step through the web graph
00261     for(WebNodeList::iterator i = graph->begin(); i != graph->end(); i++ ) {
00262       assert(*i);
00263       for(int k = 0; k < TAG_NUMBER_OF_BITS; k++) {
00264         if( (*i)->Tagged(k) ) {
00265           // this node belongs to the kth FromSet
00266           fromsetsize_remaining[k]--;
00267           assert(fromsetsize_remaining[k] >= 0);
00268 
00269           // see how many times it should be chosen as a starting point
00270           double p = 1.0/static_cast<double>(fromsetsize_remaining[k]);
00271           unsigned int num_starts = gsl_ran_binomial(r, p, allocated[k]);
00272 
00273           // now that we have the number of starting chains, ie
00274           // num_starts ~ Binomial(allocated[k], p),
00275           // we iterate each chain exactly k times
00276           for(uint32 c = 0; c < num_starts; c++) {
00277             WebNodePtr x = (*i);
00278             int s = k;
00279             while( (s > 0) && (x->Tagged(s)) ) {
00280               // if x fails to be tagged(s), then 
00281               // there is no hope of reaching the tagged(0) set
00282               x = QEvolveFrom(x);
00283               s--;
00284             }
00285             // accept only if x belongs to tagged set
00286             if( (s == 0) && (x->Tagged(0)) ) { // test s==0 is probably unnecessary
00287               x->IncrementOccupationCount();
00288               last_tagged_run_size++;
00289               num_accepted++;
00290             }
00291           }
00292 
00293           // now that we've run the chains, remove them
00294           // from the total allocated number
00295           allocated[k] -= num_starts; 
00296           assert(allocated[k] >= 0);
00297         }
00298       }
00299     }
00300     cerr << "info: tagged simulation has produced " << num_accepted << " local samples" << endl;
00301   }
00302 }
00303 
00304 //-------------------------------------//
00305 
00306 PageRankSampler :: PageRankSampler(WebLinkGraph* agraph): WebSampler(agraph) {
00307   eps = 0.5;
00308 }
00309 
00310 void PageRankSampler :: SetParameters(double epsilon) {
00311   assert(graph);
00312   assert( (epsilon > 0) && (epsilon < 1));
00313   eps = epsilon;
00314   ClearCounts();
00315 }
00316 
00317 /// Returns the effect of a single random transition.
00318 /// If result is NULL, this means the new state is a LeafNode
00319 inline WebNodePtr PageRankSampler :: QEvolveFrom(WebNodePtr x) {
00320 
00321   int nx = x->NumberOfValidToLinks() + x->NumberOfLeafLinks();
00322   if( nx == 0 ) {
00323     return x;
00324   }
00325 
00326   register int where = 
00327     gsl_rng_uniform_int(r, nx);
00328   if( where < x->NumberOfValidToLinks() ) {
00329     return x->ValidToLink(where);
00330   } else {
00331     xleaf = x->ValidLeafLinkDirectly(where); 
00332     return NULL;
00333   }
00334 
00335 }
00336 
00337 char *PageRankSampler :: Name(char *buf) {
00338   sprintf(buf, "PageRank, epsilon = %f", eps);
00339   return buf;
00340 }
00341 
00342 //-------------------------------------//
00343 
00344 DateBiasedPageRankSampler :: DateBiasedPageRankSampler(WebLinkGraph* agraph): WebSampler(agraph) {
00345   eps = 0.5;
00346   lam = 0.01;
00347   daterange = kuint16max;
00348   lamhat = lam/daterange;
00349 
00350   // this is for safety, don't remove
00351   assert(sizeof(DBScratchStruct) == sizeof(ScratchStruct));
00352 }
00353 
00354 void DateBiasedPageRankSampler :: SetParameters(double epsilon, double lambda, uint16 dr) {
00355   assert(graph);
00356   assert( (epsilon > 0) && (epsilon < 1));
00357   eps = epsilon;
00358   lam = lambda;
00359   assert(dr > 0);
00360   daterange = dr;
00361   lamhat = lam/daterange;
00362   ClearCounts();
00363 }
00364 
00365 
00366 /// Returns the effect of a single random transition.
00367 /// If result is NULL, this means the new state is a LeafNode
00368 inline WebNodePtr DateBiasedPageRankSampler :: QEvolveFrom(WebNodePtr x) {
00369 
00370   int nx = x->NumberOfValidToLinks() + x->NumberOfLeafLinks();
00371   if( nx == 0 ) {
00372     return x;
00373   }
00374 
00375   register DBScratchStruct scratch(x->Scratch());
00376   // work out total mass if needed
00377   if( !scratch.ss ) {
00378     scratch.mass = 0.0;
00379     for(register int t = 0; t < x->NumberOfValidToLinks(); t++) {
00380       scratch.mass += exp(-lamhat * max(x->Date() - x->ValidToLink(t)->Date(), 0));
00381     }
00382     for(register int t = 0; t < x->NumberOfLeafLinks(); t++) {
00383       scratch.mass += exp(-lamhat * max(x->Date() - x->ValidLeafLink(t)->Date(), 0));
00384     }
00385     x->SetScratch(scratch.ss);
00386   }
00387 
00388   register float tmass = 0.0;
00389 
00390   for(register int t = 0; t < x->NumberOfValidToLinks(); t++) {
00391     register float xi = gsl_rng_uniform(r);
00392     register float p = exp(-lamhat * max(x->Date() - x->ValidToLink(t)->Date(),0) );
00393     if( p + tmass * xi >= scratch.mass * xi ) {
00394       return x->ValidToLink(t);
00395     } else {
00396       tmass += p;
00397     }
00398   }
00399 
00400   for(register int t = 0; t < x->NumberOfLeafLinks(); t++) {
00401     register float xi = gsl_rng_uniform(r);
00402     register float p = exp(-lamhat * max(x->Date() - x->ValidLeafLink(t)->Date(),0) );
00403     if( p + tmass * xi >= scratch.mass * xi ) {
00404       xleaf = x->ValidLeafLink(t);
00405       return NULL;
00406     } else {
00407        tmass += p;
00408     }
00409   }
00410 
00411   // if we're here, there was a precision problem
00412   // log it and choose a link at random like in PageRank
00413   cerr << "warning: precision difficulties in DateBiasedPageRankSampler::QEvolveFrom()" << endl;
00414 
00415   register int where = 
00416     gsl_rng_uniform_int(r, nx);
00417   if( where < x->NumberOfValidToLinks() ) {
00418     return x->ValidToLink(where);
00419   } else {
00420     xleaf = x->ValidLeafLinkDirectly(where); 
00421     return NULL;
00422   }
00423 
00424 }
00425 
00426 
00427 char *DateBiasedPageRankSampler :: Name(char *buf) {
00428   sprintf(buf, "DateBiasedPageRank, epsilon = %f, lambda = %f", eps, lam);
00429   return buf;
00430 }
00431 
00432 ///
00433 /// old implementation
00434 ///
00435 // /// Returns the effect of a single random transition.
00436 // /// If result is NULL, this means the new state is a LeafNode
00437 // inline WebNodePtr DateBiasedPageRankSampler :: QEvolveFrom(WebNodePtr x) {
00438 
00439 //   int nx = x->NumberOfValidToLinks() + x->NumberOfLeafLinks();
00440 //   if( nx == 0 ) {
00441 //     return x;
00442 //   }
00443 
00444 //   register DBScratchStruct scratch(x->Scratch());
00445 //   // work out normalising constant if needed
00446 //   if( !scratch.db.filled ) {
00447 //     register uint16 maxdate = 0;
00448 //     for(register int t = 0; t < x->NumberOfValidToLinks(); t++) {
00449 //       maxdate = max(maxdate, x->ValidToLink(t)->Date());
00450 //     }
00451 //     for(register int t = 0; t < x->NumberOfLeafLinks(); t++) {
00452 //       maxdate = max(maxdate, x->ValidLeafLink(t)->Date());
00453 //     }
00454 //     scratch.db.minvalue = x->Date() - maxdate;
00455 //     scratch.db.filled = 1;
00456 //     x->SetScratch(scratch.ss);
00457 //   }
00458 
00459 //   // rejection sampler
00460 //   register int where = gsl_rng_uniform_int(r, nx);
00461 
00462 //   while(1) {
00463 //     register uint16 d = (where < x->NumberOfValidToLinks()) ? 
00464 //       x->ValidToLink(where)->Date() : 
00465 //       x->ValidLeafLinkDirectly(where)->Date();
00466 //     if( lam * (x->Date() - d - scratch.db.minvalue) 
00467 //      <= gsl_ran_exponential(r,1.0) ) {
00468 //       break;
00469 //     } else {
00470 //       where = gsl_rng_uniform_int(r, nx);
00471 //     }
00472 //   }
00473     
00474 //   if( where < x->NumberOfValidToLinks() ) {  
00475 //     return x->ValidToLink(where);
00476 //   } else {
00477 //     xleaf = x->ValidLeafLinkDirectly(where);
00478 //     return NULL;
00479 //   }
00480 // }
00481 
00482 //-------------------------------------//
00483 
00484 TruncatedKleinbergSampler :: TruncatedKleinbergSampler(WebLinkGraph* agraph): WebSampler(agraph) {
00485   eps = 0.5;
00486 }
00487 
00488 void TruncatedKleinbergSampler :: SetParameters(double epsilon, ktype what) {
00489   assert(graph);
00490   assert( (epsilon > 0) && (epsilon < 1));
00491   eps = epsilon;
00492   which = what;
00493   ClearCounts();
00494 }
00495 
00496 /// Returns the effect of a single random transition.
00497 /// Not designed to work with leaf nodes
00498 inline WebNodePtr TruncatedKleinbergSampler :: QEvolveFrom(WebNodePtr x) {
00499 
00500   int nx;
00501 
00502   switch(which) {
00503   case hubs:
00504     nx = x->NumberOfValidToLinks();
00505     if( nx > 0 ) {
00506       // forward
00507       register int where = gsl_rng_uniform_int(r, nx);
00508       x = x->ValidToLink(where);
00509       // backward
00510       nx = x->NumberOfValidFromLinks();
00511       if( nx > 0 ) {
00512         where = gsl_rng_uniform_int(r, nx);
00513         return x->ValidFromLink(where);
00514       } else {
00515         return x;
00516       }
00517     } else {
00518       return x;
00519     }
00520     break;
00521   case auth:
00522     nx = x->NumberOfValidFromLinks();
00523     if( nx > 0 ) {
00524       // backward
00525       register int where = gsl_rng_uniform_int(r, nx);
00526       x = x->ValidFromLink(where);
00527       // forward
00528       nx = x->NumberOfValidToLinks();
00529       if( nx > 0 ) {
00530         where = gsl_rng_uniform_int(r, nx);
00531         return x->ValidToLink(where);
00532       } else {
00533         return x;
00534       }
00535     } else {
00536       return x;
00537     }
00538     break;
00539   default:
00540     return x;
00541   }
00542 
00543 }
00544 
00545 char *TruncatedKleinbergSampler :: Name(char *buf) {
00546   sprintf(buf, "Truncated Kleinberg, epsilon = %f %s", eps,
00547           (which == hubs) ? "(hubs)" : "(auth)");
00548   return buf;
00549 }

Generated on Wed May 29 11:37:15 2002 for MarkovPR by doxygen1.2.15