00001 /* -*- C++ -*- 00002 00003 textIR - A fast text document retrieval engine 00004 00005 Copyright (C) 2005-2009 Laurence Park 00006 00007 This program is free software: you can redistribute it and/or modify 00008 it under the terms of the GNU General Public License as published by 00009 the Free Software Foundation, either version 3 of the License, or 00010 (at your option) any later version. 00011 00012 This program is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 GNU General Public License for more details. 00016 00017 You should have received a copy of the GNU General Public License 00018 along with this program. If not, see <http://www.gnu.org/licenses/>. 00019 00020 File information: 00021 $Header: /home/staff/lapark/cvsroot/web_search/textIR/src/WordSubIndex.h,v 1.10 2009/10/08 06:41:31 lapark Exp $ 00022 */ 00023 00024 #ifndef WORDSUBINDEX_H 00025 #define WORDSUBINDEX_H 00026 00027 #include "SubIndex.tcc" 00028 #include "Word.h" 00029 #include "ElementList.tcc" 00030 #include "DocumentList.h" 00031 00032 // for plsa 00033 00051 class OrderHash { 00052 public: 00054 int _order; 00055 00057 int _hashOrder; 00058 }; 00059 00077 class SortOrderHash { 00078 public: 00079 00081 bool operator() (const OrderHash &x, const OrderHash &y) { 00082 return (x._hashOrder < y._hashOrder); 00083 } 00084 }; 00085 00107 class WordSubIndex : public SubIndex<Word> { 00108 public: 00109 00111 WordSubIndex(BlockStats *stats); 00112 ~WordSubIndex(void); 00113 00115 void buildWordList(void); 00116 00118 void buildDocumentList(void); 00119 00121 int wordCount(void); 00122 00124 int documentCount(void); 00125 00127 Word *wordStats(int word); 00128 00130 Document *documentStats(int document); 00131 00133 void generateIndexList(int minDocumentCount,int maxDocumentCount); 00134 00136 void generateIndexList(Word **queryTerms, int uniqueQueryTerms); 00137 00138 void hashHack(void); 00139 00141 int elementDocumentCount(int i); 00142 00144 FILE_POS_TYPE elementFilePosition(int i); 00145 00147 Word *indexStats(int word); 00148 00150 int indexElements(int i); 00151 00153 int indexLength(void); 00154 00156 int rowLength(void); 00157 00158 protected: 00160 ElementList<Word> *_wordList; 00161 00163 DocumentList *_documentList; 00164 00166 BlockStats *_stats; 00167 }; 00168 00169 #endif 00170