/*******************************************************************************
    Copyright (c) 2011, Yahoo! Inc.
    All rights reserved.

    Redistribution and use of this software in source and binary forms,
    with or without modification, are permitted provided that the following
    conditions are met:

    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimer.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimer in the documentation and/or other
      materials provided with the distribution.

    * Neither the name of Yahoo! Inc. nor the names of its
      contributors may be used to endorse or promote products
      derived from this software without specific prior
      written permission of Yahoo! Inc.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    The Initial Developer of the Original Code is Shravan Narayanamurthy.
00036 ******************************************************************************/ 00037 /* 00038 * Data_Formatter.h 00039 * 00040 * 00041 * Created on: 11-Jan-2011 00042 * 00043 */ 00044 00045 #ifndef DATA_FORMATTER_H_ 00046 #define DATA_FORMATTER_H_ 00047 00048 #include "WordIndexDictionary.h" 00049 00050 //!An interface for formatter objects 00051 /** A formatter is an object that 00052 * converts raw text corpus into 00053 * binary so that its disk footprint 00054 * is low and there is no parsing 00055 * involved while reading it back 00056 */ 00057 class Data_Formatter { 00058 public: 00059 //!Perform the actual formatting 00060 virtual void format() = 0; 00061 00062 //!Return the dictionary being used by the formatter 00063 virtual WordIndexDictionary& get_dictionary() = 0; 00064 00065 //!The number of documents formatted 00066 virtual int get_num_docs() = 0; 00067 00068 //!The total number of words found 00069 virtual int get_total_num_words() = 0; 00070 }; 00071 00072 #endif /* DATA_FORMATTER_H_ */