00001 /******************************************************************************* 00002 Copyright (c) 2011, Yahoo! Inc. 00003 All rights reserved. 00004 00005 Redistribution and use of this software in source and binary forms, 00006 with or without modification, are permitted provided that the following 00007 conditions are met: 00008 00009 * Redistributions of source code must retain the above 00010 copyright notice, this list of conditions and the 00011 following disclaimer. 00012 00013 * Redistributions in binary form must reproduce the above 00014 copyright notice, this list of conditions and the 00015 following disclaimer in the documentation and/or other 00016 materials provided with the distribution. 00017 00018 * Neither the name of Yahoo! Inc. nor the names of its 00019 contributors may be used to endorse or promote products 00020 derived from this software without specific prior 00021 written permission of Yahoo! Inc. 00022 00023 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 00024 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 00025 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 00026 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 00027 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00028 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00029 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00030 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00031 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00032 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00033 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00034 00035 The Initial Developer of the Original Code is Shravan Narayanamurthy. 00036 ******************************************************************************/ 00037 /* 00038 * types.h 00039 * Contains various type definitions used globally 00040 * throughout the code 00041 * 00042 * Created on: 25 Mar, 2009 00043 * 00044 */ 00045 00046 #ifndef TYPES_H_ 00047 #define TYPES_H_ 00048 #include <vector> 00049 00050 #include "tbb/spin_rw_mutex.h" 00051 #include "document.pb.h" 00052 #include "tbb/tick_count.h" 00053 #include <sstream> 00054 #include <boost/random/mersenne_twister.hpp> 00055 #include "glog/logging.h" 00056 #include "boost/unordered_map.hpp" 00057 #include "tbb/atomic.h" 00058 00059 #define TIME(t) tick_count t = tick_count::now() 00060 #define PRINT_TIME(t1,t2,str) cout << "Time taken to " #str << " : " << (t2-t1).seconds() << " secs" << endl 00061 00062 typedef int32_t size_int; //A type to denote the size of serialize protobuf msgs 00063 00064 typedef int32_t cnt_t; //A type for counts 00065 typedef int32_t topic_t; //A type for topics 00066 typedef int32_t word_t; //A type for words 00067 typedef uint64_t packed_t; //A type for the packed 00068 //(topic,counts) 00069 00070 static const uint16_t INIT_TC_SIZE = 8; //Initial block allocation for TopicCounts 00071 static const uint16_t SUBSEQ_ALLOCS = 4; //Subsequent allocations for TopicCounts 00072 00073 static const float MEM_LOAD_PER = 0.7; 00074 00075 typedef std::pair<int, int> bigram_key_t; 00076 00077 /*typedef struct{ 00078 float* values; 00079 int length; 00080 float sum; 00081 }Parameters;*/ 00082 00083 typedef struct { 00084 //A structure to conveniently 00085 //and portably access topics 00086 //and counts separately. We try 00087 //to avoid direct access when 00088 //speed is not a major concern 00089 topic_t top; 00090 cnt_t cnt; 00091 } choose_t; 00092 00093 typedef union { 00094 //The main structure for the storage 00095 //and access of (topic,counts) in a 00096 //packed fashion. 00097 //Example: (100,10) is stored in pkd form 00098 //using cnt_top_t ct; ct.choose.top=100; 00099 //ct.choose.cnt=10; Access is similar. 00100 //This is the only portable way of doing 00101 //it. The moment you start accessing portions 00102 //of cnt_top directly, you make assumptions 00103 //on the order of struct storage and endianness. 00104 //There are parts in sampler & TopicCounts which are 00105 //totally arch specific and are not portable 00106 //This was done to improve performance 00107 packed_t cnt_top; 00108 choose_t choose; 00109 } cnt_topic_t; 00110 00111 typedef tbb::spin_rw_mutex word_mutex_t; //A type for read,write spin lock 00112 00113 typedef struct { 00114 //Message type to store the 00115 //changes suggested by sampler 00116 word_t word; 00117 topic_t old_topic; 00118 topic_t new_topic; 00119 int old_indicator; 00120 int new_indicator; 00121 } change_elem_t; 00122 00123 typedef struct { 00124 //Message type between sampler 00125 //and updater 00126 LDA::unigram_document* doc; 00127 std::vector<change_elem_t>* change_list; 00128 bool ignore_old_topic; 00129 } update_t; 00130 00131 typedef boost::mt19937 base_generator_type; //variate generator used by RNGS 00132 00133 typedef std::pair<topic_t, float> tppair; //(topic,proportion) pair used in dumping 00134 //topic proportions to disk 00135 00136 typedef std::pair<word_t, float> wppair; //(word,proportion) pair 00137 typedef std::pair<bigram_key_t, float> bigppair; //(bigram,proportion) pair 00138 00139 typedef std::pair<int, int> id2freq_t; //(word_id,frequency) used in dictionary to compute 00140 //and maintain word frequencies 00141 00142 /** 00143 * Exception thrown when an old topic is not found 00144 * This is fatal and causes the program to stop 00145 */ 00146 class InvalidOldTopicExc: public std::exception { 00147 word_t word; 00148 topic_t old_topic; 00149 00150 public: 00151 InvalidOldTopicExc(word_t word_, topic_t old_topic_) { 00152 word = word_; 00153 old_topic = old_topic_; 00154 } 00155 00156 virtual const char* what() const throw () { 00157 std::ostringstream ret; 00158 ret << "The old topic " << old_topic 00159 << " argument is invalid for the word " << word; 00160 return ret.str().c_str(); 00161 } 00162 }; 00163 00164 typedef boost::unordered_map<topic_t, cnt_t> mapped_vec; //The type for map used in topicCounts 00165 00166 #endif /* TYPES_H_ */