00001 /******************************************************************************* 00002 Copyright (c) 2011, Yahoo! Inc. 00003 All rights reserved. 00004 00005 Redistribution and use of this software in source and binary forms, 00006 with or without modification, are permitted provided that the following 00007 conditions are met: 00008 00009 * Redistributions of source code must retain the above 00010 copyright notice, this list of conditions and the 00011 following disclaimer. 00012 00013 * Redistributions in binary form must reproduce the above 00014 copyright notice, this list of conditions and the 00015 following disclaimer in the documentation and/or other 00016 materials provided with the distribution. 00017 00018 * Neither the name of Yahoo! Inc. nor the names of its 00019 contributors may be used to endorse or promote products 00020 derived from this software without specific prior 00021 written permission of Yahoo! Inc. 00022 00023 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 00024 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 00025 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 00026 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 00027 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00028 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00029 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00030 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00031 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00032 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00033 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00034 00035 The Initial Developer of the Original Code is Shravan Narayanamurthy. 00036 ******************************************************************************/ 00037 /* 00038 * WordIndexDictionary.h 00039 * 00040 * 00041 * 00042 * Created on: 7 May, 2009 00043 * 00044 */ 00045 00046 #ifndef WORDINDEXDICTIONARY_H_ 00047 #define WORDINDEXDICTIONARY_H_ 00048 00049 #include "boost/unordered_map.hpp" 00050 #include "types.h" 00051 #include "DocumentReader.h" 00052 #include <climits> 00053 00054 using namespace std; 00055 using namespace boost; 00056 //!A two way dictionary of words to indices 00057 /** 00058 * Provides a two way dictionary mapping 00059 * words as strings to a unique int index 00060 * and vice versa. The hashtable implementation 00061 * of boost/unordered_map is used. 00062 * 00063 */ 00064 class WordIndexDictionary { 00065 public: 00066 WordIndexDictionary(); 00067 virtual ~WordIndexDictionary(); 00068 int get_index(string word); 00069 string get_word(int index); 00070 int insert_word(string word); 00071 int get_num_words() const; 00072 void print(); 00073 bool match_word_index(); 00074 void dump(string fname); 00075 void initialize_from_dict(WordIndexDictionary* dict, bool sort = false); 00076 void initialize_from_dump(string fname, int num_words = INT_MAX, bool sort = 00077 false); 00078 void initialize_from_dumps(string prefix, int dumps); 00079 size_t size(); 00080 int get_prev_index(int new_id); 00081 int get_freq(int index); 00082 00083 vector<id2freq_t> frequencies; 00084 00085 private: 00086 typedef unordered_map<string, int> wimap; 00087 unordered_map<string, int> word_ind_map; 00088 unordered_map<int, string> ind_word_map; 00089 00090 wimap::iterator wi_end; 00091 int current_index; 00092 int insert_word(string word, int index_); 00093 int verify_header(DocumentReader & doc_rdr); 00094 string get_suffix(int n); 00095 void sort_on_freq(); 00096 string suffices[100]; 00097 }; 00098 00099 #endif /* WORDINDEXDICTIONARY_H_ */