00001 /******************************************************************************* 00002 Copyright (c) 2011, Yahoo! Inc. 00003 All rights reserved. 00004 00005 Redistribution and use of this software in source and binary forms, 00006 with or without modification, are permitted provided that the following 00007 conditions are met: 00008 00009 * Redistributions of source code must retain the above 00010 copyright notice, this list of conditions and the 00011 following disclaimer. 00012 00013 * Redistributions in binary form must reproduce the above 00014 copyright notice, this list of conditions and the 00015 following disclaimer in the documentation and/or other 00016 materials provided with the distribution. 00017 00018 * Neither the name of Yahoo! Inc. nor the names of its 00019 contributors may be used to endorse or promote products 00020 derived from this software without specific prior 00021 written permission of Yahoo! Inc. 00022 00023 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 00024 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 00025 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 00026 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 00027 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00028 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00029 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00030 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00031 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00032 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00033 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00034 00035 The Initial Developer of the Original Code is Shravan Narayanamurthy. 00036 ******************************************************************************/ 00037 /* 00038 * DocumentReader.h 00039 * 00040 * 00041 * Created on: 7 May, 2009 00042 * 00043 */ 00044 00045 #ifndef DOCUMENTREADER_H_ 00046 #define DOCUMENTREADER_H_ 00047 00048 #include <fstream> 00049 #include "google/protobuf/message.h" 00050 #include "constants.h" 00051 00052 using namespace std; 00053 using namespace LDA; 00054 00055 /** 00056 * Wrapper around protobuf messages for convenient 00057 * reading of words, topics & (word,index) pairs 00058 * from word, topic, dictionary dump files respectively. 00059 * Assumes that each msg is in a binary file in record* format 00060 * where record=(size of serialized msg,msg serialized as string) 00061 */ 00062 class DocumentReader { 00063 public: 00064 DocumentReader(string w_fname_); 00065 virtual ~DocumentReader(); 00066 00067 int read(google::protobuf::Message* msg); 00068 00069 private: 00070 00071 string w_fname; //The input file to read various msgs from 00072 00073 ifstream w_input; //The input stream to read from w_fname 00074 00075 char* c_size; //Array to store size of serialized msg 00076 char* c_msg; //Array to store the serialized msg 00077 00078 /** 00079 * The main function that reads records from input and stores them c_size & c_msg 00080 * fname is used for logging 00081 */ 00082 inline int read_sized_record_from(ifstream& input, string fname) 00083 throw (int) { 00084 if (input.read(c_size, sizeof(size_int)).eof()) { 00085 LOG_IF(FATAL,input.bad())<< "Unable to read from input file: " <<fname; 00086 throw -1; 00087 } 00088 size_int size = *(size_int*)c_size; 00089 LOG_IF(FATAL,size>=MAX_MSG_SIZE) << "Reading input from " << fname << "Message size " << size << " exceeds " << MAX_MSG_SIZE << ". Quitting..."; 00090 00091 input.read(c_msg,size); 00092 return size; 00093 } 00094 00095 /** 00096 * The base method to read into msg from inp_str 00097 * inp_fname is used for logging 00098 */ 00099 inline void read_base(ifstream& inp_str, string& inp_fname, google::protobuf::Message* msg) throw(int) { 00100 size_int size = read_sized_record_from(inp_str, inp_fname); 00101 string str_message(c_msg, size); 00102 msg->Clear(); 00103 msg->ParseFromString(str_message); 00104 } 00105 }; 00106 00107 #endif /* DOCUMENTREADER_H_ */