/*******************************************************************************
    Copyright (c) 2011, Yahoo! Inc.
    All rights reserved.

    Redistribution and use of this software in source and binary forms,
    with or without modification, are permitted provided that the following
    conditions are met:

    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimer.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimer in the documentation and/or other
      materials provided with the distribution.

    * Neither the name of Yahoo! Inc. nor the names of its
      contributors may be used to endorse or promote products
      derived from this software without specific prior
      written permission of Yahoo! Inc.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    The Initial Developer of the Original Code is Shravan Narayanamurthy.
******************************************************************************/
/*
* TypeTopicCounts.h
* The main class, so to speak, which maintains the
* Word-Topic counts table (n(t,w)). This is just a wrapper
* around the TopicCounts structure that gives a
* sparse matrix view out of a sparse vector (TopicCounts).
* Also note that the words are not strings but indexes
* which can be looked up in the dictionary if needed.
* The protobuf file also stores the word indexes instead
* of words to save space.
*
* It basically has 2 types of initializations:
* 1. Directly from the words & Topics files (initialize_from_docs) - slow
* 2. From a binary dump produced by the dump() method (initialize_from_dump) - faster
*
* The basic structure maintains, for each unique word
* found in the corpus/dictionary, the number of times
* each topic has been assigned. It supports multithreaded
* access with a per-word locking granularity. It is designed
* to do fast updates. The locks are classified into read
* and write locks to minimize contention. The updates/setters
* hold a write lock and block further reads and writes on the
* locked word. The get_counts methods hold a read lock and allow
* other reads, which are way more frequent as there are multiple
* sampler threads trying to read the topic counts whereas there
* is only a single thread updating in the single machine case
* and 2 background threads and the updater thread
* in the case of multi-machine code. Since update locks are
* expensive, we try to optimize updates.
*
* The basic access pattern is assumed to be:
* Do get_counts on the reqd word.
* Update the counts thread locally and
* Update the shared table.
00071 * 00072 * So the basic design aim is to support multi-threaded reads and 00073 * almost single threaded updates 00074 * 00075 * This also maintains the sum of topic counts across the corpus (n(t)) 00076 * However, there is no locking involved while doing this as it is 00077 * declared as an array of tbb::atomics. 00078 * 00079 * Created on: 24 Mar, 2009 00080 * 00081 */ 00082 00083 #ifndef TYPETOPICCOUNTS_H_ 00084 #define TYPETOPICCOUNTS_H_ 00085 00086 #include <iostream> 00087 #include <exception> 00088 00089 #include <stdio.h> 00090 00091 #include "TopKList.h" 00092 #include "tbb/atomic.h" 00093 #include "TopicCounts.h" 00094 #include "WordIndexDictionary.h" 00095 00096 using namespace std; 00097 using namespace tbb; 00098 using namespace LDA; 00099 using namespace boost; 00100 00101 class TypeTopicCounts { 00102 public: 00103 /************* Init ************/ 00104 TypeTopicCounts(); 00105 TypeTopicCounts(word_t num_words, topic_t num_topics); 00106 virtual ~TypeTopicCounts(); 00107 00108 void initialize_from_docs(string wfname, string tfname); 00109 int verify_header(DocumentReader& doc_rdr); 00110 void initialize_from_string(word_t word, string& counts); 00111 bool initialize_from_dump(string fname, 00112 WordIndexDictionary* local_dict = NULL, 00113 WordIndexDictionary* global_dict = NULL, size_t offset = 0); 00114 // void initialize_from_memcached(const char* servers, int cli_id, int num_clis, WordIndexDictionary* dict, int from, int to); 00115 void initialize_from_ttc(TypeTopicCounts* ttc); 00116 static pair<int, float> estimate_fit(string fname, 00117 WordIndexDictionary* dict); 00118 static pair<int, float> estimate_fit(string fname, float used_memory, 00119 int& incoming_words); 00120 void estimate_alphas(double* alphas, double& alpha_sum); 00121 void dump(string fname); 00122 /************* Init Ends ********/ 00123 00124 /************* Getters ********/ 00125 topic_t get_counts(word_t word, topicCounts* tc); 00126 topic_t 
get_counts(atomic<topic_t>* tc); 00127 00128 word_t get_num_words(); 00129 topic_t get_num_topics(); 00130 word_mutex_t* get_lock(word_t word); 00131 pair<TopKList**, TopKList*> get_topic_stats(); 00132 00133 // double model_loglikelihood(); 00134 /************* Getters End********/ 00135 00136 /************* Setters ********/ 00137 void replace(word_t word, topicCounts& tc); 00138 void upd_count(word_t word, topic_t old_topic, topic_t new_topic, 00139 bool ignore_old_topic = false); 00140 void upd_count(word_t word, mapped_vec delta, string dbg = ""); 00141 /************* Setters End********/ 00142 00143 /************* Test & Debug ********/ 00144 bool equal(const TypeTopicCounts& expected); 00145 string print(word_t word); 00146 void print(); 00147 void initialize(topicCounts* wtc, atomic<topic_t>* tc, word_t word = 0); 00148 void initialize(topicCounts** wtc, atomic<topic_t>* tc); 00149 /************* Test & Debug Ends ********/ 00150 00151 friend class Ice_Synchronizer; 00152 friend class Memcached_Synchronizer; 00153 00154 protected: 00155 atomic<topic_t> *tokens_per_topic; //n(t) 00156 topic_t num_topics; // The number of topics being learnt 00157 TopKList **topic_stats, //Topic Statistics per topic 00158 *top_topics; //The hot/top topics 00159 00160 00161 void estimate_memoryn_warn(long num_elems); 00162 void clear_stats(); 00163 void init(topic_t num_topics_); 00164 void destroy(); 00165 00166 private: 00167 topicCounts **wt; //The pointer to and array of topicCounts one per word 00168 word_mutex_t *word_mutexes; //An array of locks one per word 00169 word_t num_words; //The num of unique words 00170 00171 bool validate(); 00172 }; 00173 00174 #endif /* TYPETOPICCOUNTS_H_ */