/Source Code/Indexer/Indexer.vb
Visual Basic | 531 lines | 315 code | 112 blank | 104 comment | 0 complexity | 8d6cbd1cbdc784a882c4c892a307ad9c MD5 | raw file
Imports System.IO

''' <summary>
''' Maintains an inverted index over a set of documents: for every word it records
''' the total number of occurrences and the documents in which the word occurs.
''' </summary>
''' <remarks>
''' Documents and words are both identified by integer indices. Two dictionaries
''' are kept for each so that look-ups work in both directions (index to name and
''' name to index). Words are stored upper-cased and stripped of every character
''' that is not an ASCII letter or digit.
''' </remarks>
Public Class Indexer

    ' Characters at which text is split into candidate words. Any other
    ' non-alphanumeric character is not a separator; it is removed by FixWord.
    Private Shared ReadOnly WordSeparators As Char() = {" "c, "!"c, "."c, ":"c, """"c, "-"c, _
                                                        Microsoft.VisualBasic.ControlChars.Cr, _
                                                        Microsoft.VisualBasic.ControlChars.Lf}

    Private Files As New Dictionary(Of Integer, String) ' Key = File index, Value = File name
    Private FileNumbers As New Dictionary(Of String, Integer) ' Key = File name, Value = File index
    Private LastAddedFileIndex As Integer = 0 ' Starting point when searching for a free file index

    Private Words As New Dictionary(Of Integer, String) ' Key = Word index, Value = Word
    Private WordNumbers As New Dictionary(Of String, Integer) ' Key = Word, Value = Word index
    Private LastAddedWordIndex As Integer = 0 ' Starting point when searching for a free word index

    Private Index As New Dictionary(Of Integer, WordOccurrences) ' Key = Word index

    ''' <summary>
    ''' Returns the complete list of document numbers.
    ''' </summary>
    ''' <returns>A new array holding the index of every document currently in the index.</returns>
    Public ReadOnly Property AllDocumentsNumbers() As Integer()
        Get
            Dim Result(Files.Count - 1) As Integer
            Files.Keys.CopyTo(Result, 0)
            Return Result
        End Get
    End Property

    ''' <summary>
    ''' Returns the total number of documents indexed.
    ''' </summary>
    Public ReadOnly Property DocumentCount() As Integer
        Get
            Return Files.Count
        End Get
    End Property

    ''' <summary>
    ''' Returns the name of the document, given its index.
    ''' </summary>
    ''' <param name="index">The numeric index of the document.</param>
    ''' <exception cref="ApplicationException">No document with the given index exists.</exception>
    Public ReadOnly Property DocumentName(ByVal index As Integer) As String
        Get
            Dim FoundName As String = Nothing
            If Files.TryGetValue(index, FoundName) Then Return FoundName
            Throw New ApplicationException("Document " + index.ToString + " does not exist in the index.")
        End Get
    End Property

    ''' <summary>
    ''' Returns the numeric index of the document, given its name.
    ''' </summary>
    ''' <param name="name">The name under which the document was added.</param>
    ''' <exception cref="ApplicationException">No document with the given name exists.</exception>
    Public ReadOnly Property DocumentNumber(ByVal name As String) As Integer
        Get
            Dim FoundNumber As Integer = 0
            If FileNumbers.TryGetValue(name, FoundNumber) Then Return FoundNumber
            Throw New ApplicationException("The document '" + name + "' does not exist in the index.")
        End Get
    End Property

    ''' <summary>
    ''' Returns the document indices of documents containing the specified word.
    ''' </summary>
    ''' <param name="word">The word to search for. Matching is case-insensitive.</param>
    ''' <returns>The indices of the matching documents; empty when the word is unknown.</returns>
    ''' <exception cref="ApplicationException">
    ''' The word is in the vocabulary but has no entry in the index (inconsistent state).
    ''' </exception>
    Public Function DocumentsContainingWord(ByVal word As String) As List(Of Integer)

        Dim Result As New List(Of Integer)

        Dim WordIndex As Integer = -1
        If Not WordNumbers.TryGetValue(word.ToUpperInvariant(), WordIndex) Then Return Result

        Dim Occurrences As WordOccurrences = Nothing
        If Not Index.TryGetValue(WordIndex, Occurrences) Then
            Throw New ApplicationException("The word " + word + " exists in the words collection but not the index itself.")
        End If

        For Each DocumentIndex As Integer In Occurrences.Documents.Keys
            Result.Add(DocumentIndex)
        Next
        Return Result

    End Function

    ''' <summary>
    ''' Returns the number of times the given word occurs in the given document.
    ''' </summary>
    ''' <param name="documentIndex">The document's index to search in.</param>
    ''' <param name="word">The word to search for. Matching is case-insensitive.</param>
    ''' <returns>The stored count, or 0 when the word or the document is not found.</returns>
    Public Function TermFrequency(ByVal documentIndex As Integer, ByVal word As String) As Integer

        Dim Result As Integer = 0

        Dim WordIndex As Integer = -1
        If WordNumbers.TryGetValue(word.ToUpperInvariant(), WordIndex) Then
            ' TryGetValue leaves Result at 0 when the document does not contain the word
            Index(WordIndex).Documents.TryGetValue(documentIndex, Result)
        End If

        Return Result

    End Function

    ''' <summary>
    ''' Empties the index.
    ''' </summary>
    Public Sub Clear()

        Files.Clear()
        FileNumbers.Clear()
        LastAddedFileIndex = 0

        Words.Clear()
        WordNumbers.Clear()
        LastAddedWordIndex = 0

        Index.Clear()

    End Sub

    ''' <summary>
    ''' Returns the unique words in the given file's contents.
    ''' </summary>
    ''' <param name="fileContents">The textual contents of a file.</param>
    ''' <returns>A sorted array of the unique (normalised) words in the given text.</returns>
    Private Function GetVocabulary(ByVal fileContents As String) As String()

        ' Collect each distinct word once; the value stored is irrelevant
        Dim Unique As New Dictionary(Of String, Integer)
        For Each Token As String In SplitIntoFixedWords(fileContents)
            Unique(Token) = 0
        Next

        Dim Result(Unique.Count - 1) As String
        Unique.Keys.CopyTo(Result, 0)
        Array.Sort(Result)
        Return Result

    End Function

    ''' <summary>
    ''' Returns the number of times each word occurs in the given document.
    ''' </summary>
    ''' <param name="fileContents">The document containing words to count.</param>
    ''' <returns>Word index mapped to occurrence count, for words occurring at least once.</returns>
    ''' <remarks>
    ''' Every word of the document must already be in the vocabulary; otherwise the
    ''' look-up in WordNumbers throws a KeyNotFoundException.
    ''' </remarks>
    Private Function GetWordCounts(ByVal fileContents As String) As Dictionary(Of Integer, Integer)

        ' Tally only the words that actually occur in this document
        Dim Counts As New Dictionary(Of Integer, Integer)
        For Each Token As String In SplitIntoFixedWords(fileContents)
            Dim WordIndex As Integer = WordNumbers(Token)
            Dim Current As Integer = 0
            Counts.TryGetValue(WordIndex, Current)
            Counts(WordIndex) = Current + 1
        Next

        ' Emit the counts in vocabulary order so the result is ordered the same way
        ' regardless of where the words appear in the document
        Dim Result As New Dictionary(Of Integer, Integer)
        For Each KnownIndex As Integer In Words.Keys
            Dim Tally As Integer = 0
            If Counts.TryGetValue(KnownIndex, Tally) Then Result.Add(KnownIndex, Tally)
        Next

        Return Result

    End Function

    ''' <summary>
    ''' Splits the document into words.
    ''' </summary>
    ''' <param name="fileContents">The document to split into words.</param>
    ''' <returns>
    ''' The upper-cased, cleaned words of the document in order of appearance.
    ''' Tokens that are empty after cleaning are dropped.
    ''' </returns>
    Private Function SplitIntoFixedWords(ByVal fileContents As String) As String()

        Dim FixedWords As New List(Of String)
        For Each Token As String In fileContents.ToUpperInvariant().Split(WordSeparators)
            Dim Cleaned As String = FixWord(Token)
            If Cleaned.Length > 0 Then FixedWords.Add(Cleaned)
        Next
        Return FixedWords.ToArray()

    End Function

    ''' <summary>
    ''' Find each file in the given directory and index it.
    ''' </summary>
    ''' <param name="directory">The directory whose files to index.</param>
    ''' <remarks>
    ''' Files are not recursed, and so only the files in the specified directory itself will be indexed.
    ''' Documents that are already in the index are left untouched. Each file is read twice:
    ''' once to build the vocabulary and once to count the words.
    ''' </remarks>
    ''' <exception cref="ArgumentException">One of the files is already in the index.</exception>
    Public Sub AddDirectoryToIndex(ByVal directory As String)

        Dim FileNames As String() = IO.Directory.GetFiles(directory)

        ' Reject duplicates up front so that no file is registered when one of them clashes
        For Each FileName As String In FileNames
            ThrowIfAlreadyIndexed(FileName)
        Next

        ' Register the files, remembering which index entries are new
        Dim NewFileIndices As New List(Of Integer)
        For Each FileName As String In FileNames
            NewFileIndices.Add(AddFileName(FileName))
        Next

        ' Phase 1: Build a sorted vocabulary covering all of the new files
        Dim NewWords As New SortedDictionary(Of String, Integer)
        For Each FileIndex As Integer In NewFileIndices
            For Each Token As String In GetVocabulary(GetFileContents(Files(FileIndex)))
                NewWords(Token) = 0
            Next
        Next

        ' Copy the words that are not yet known into the global vocabulary
        For Each Token As String In NewWords.Keys
            If Not WordNumbers.ContainsKey(Token) Then AddWordToVocabulary(Token)
        Next

        ' Phase 2: Determine how many times each word occurs in each new document
        ' and merge the counts into the global index
        For Each FileIndex As Integer In NewFileIndices
            MergeDocumentWordCount(FileIndex, GetWordCounts(GetFileContents(Files(FileIndex))))
        Next

    End Sub

    ''' <summary>
    ''' Reads the given file and adds it to the index under its own file name.
    ''' </summary>
    ''' <param name="fileName">The path of the file to read and index.</param>
    Public Sub AddFileToIndex(ByVal fileName As String)
        AddFileContentsToIndex(fileName, GetFileContents(fileName))
    End Sub

    ''' <summary>
    ''' Reads the given file and adds it to the index under a different name.
    ''' </summary>
    ''' <param name="virtualFilename">The name under which the document is stored in the index.</param>
    ''' <param name="fileName">The path of the file to read.</param>
    Public Sub AddFileToIndex(ByVal virtualFilename As String, ByVal fileName As String)
        AddFileContentsToIndex(virtualFilename, GetFileContents(fileName))
    End Sub

    ''' <summary>
    ''' Adds a document to the index from text that is already in memory.
    ''' </summary>
    ''' <param name="virtualFilename">The name under which the document is stored in the index.</param>
    ''' <param name="contents">The text of the document.</param>
    ''' <exception cref="ArgumentException">A document with this name is already in the index.</exception>
    Public Sub AddFileContentsToIndex(ByVal virtualFilename As String, ByVal contents As String)

        ' Add to the list of files
        Dim FileIndex As Integer = AddFileName(virtualFilename)

        ' Add the document's words (unique and sorted) to the global vocabulary
        For Each Token As String In GetVocabulary(contents)
            If Not WordNumbers.ContainsKey(Token) Then AddWordToVocabulary(Token)
        Next

        ' Count the words and merge the counts into the global index
        MergeDocumentWordCount(FileIndex, GetWordCounts(contents))

    End Sub

    ''' <summary>
    ''' Removes a document from the index, together with any word that no longer
    ''' occurs in any remaining document.
    ''' </summary>
    ''' <param name="virtualFilename">The name under which the document was added.</param>
    ''' <exception cref="ApplicationException">No document with the given name exists.</exception>
    Public Sub RemoveDocumentFromIndex(ByVal virtualFilename As String)

        ' Check the file exists
        Dim DocumentIndex As Integer = 0
        If Not FileNumbers.TryGetValue(virtualFilename, DocumentIndex) Then
            Throw New ApplicationException("The document '" + virtualFilename + "' does not exist in the index.")
        End If

        ' Remove the file from the list of files
        Files.Remove(DocumentIndex)
        FileNumbers.Remove(virtualFilename)

        ' Detach the document from every word. Words left without documents are only
        ' collected here because Index must not be modified while it is enumerated.
        Dim WordsToRemove As New List(Of Integer)
        For Each Entry As KeyValuePair(Of Integer, WordOccurrences) In Index
            Dim WordEntry As WordOccurrences = Entry.Value

            Dim OccurrencesInDocument As Integer = 0
            If WordEntry.Documents.TryGetValue(DocumentIndex, OccurrencesInDocument) Then

                WordEntry.Occurrences -= OccurrencesInDocument
                WordEntry.Documents.Remove(DocumentIndex)

                ' This word no longer exists in any document
                If WordEntry.Documents.Count = 0 Then WordsToRemove.Add(Entry.Key)

            End If
        Next

        ' Remove WordOccurrences objects that are no longer needed, as well as unused words in vocabulary
        For Each WordIndex As Integer In WordsToRemove
            Index.Remove(WordIndex)
            WordNumbers.Remove(Words(WordIndex))
            Words.Remove(WordIndex)
        Next

    End Sub

    ''' <summary>
    ''' Loads all of the text in the given file, and returns it.
    ''' </summary>
    ''' <param name="filename">The file whose contents to retrieve.</param>
    Private Function GetFileContents(ByVal filename As String) As String

        ' Using guarantees the reader is released even when reading fails
        Using Reader As New IO.StreamReader(filename)
            Return Reader.ReadToEnd()
        End Using

    End Function

    ''' <summary>
    ''' Strips the word of all non-alphanumeric characters.
    ''' </summary>
    ''' <param name="word">The word to 'fix'.</param>
    ''' <returns>The word reduced to the characters a-z, A-Z and 0-9.</returns>
    Private Function FixWord(ByVal word As String) As String

        Dim Result As New Text.StringBuilder(word.Length)
        For Each C As Char In word
            Select Case C
                Case "a"c To "z"c, "A"c To "Z"c, "0"c To "9"c
                    Result.Append(C)
            End Select
        Next
        Return Result.ToString()

    End Function

    ''' <summary>
    ''' Merges the document's word counts into the global index.
    ''' </summary>
    ''' <param name="documentFilenameIndex">The index of the document whose word counts to merge.</param>
    ''' <param name="documentWordCount">Word indices mapped to the word's count in the document.</param>
    Private Sub MergeDocumentWordCount(ByVal documentFilenameIndex As Integer, _
                                       ByVal documentWordCount As Dictionary(Of Integer, Integer))

        For Each Entry As KeyValuePair(Of Integer, Integer) In documentWordCount
            Dim Existing As WordOccurrences = Nothing
            If Index.TryGetValue(Entry.Key, Existing) Then
                Existing.Occurrences += Entry.Value
                Existing.Documents.Add(documentFilenameIndex, Entry.Value)
            Else
                Index.Add(Entry.Key, New WordOccurrences(Entry.Value, documentFilenameIndex))
            End If
        Next

    End Sub

    ''' <summary>
    ''' Adds a word to the vocabulary and returns the index assigned to it.
    ''' </summary>
    ''' <param name="word">The (already normalised) word to add. It must not be in the vocabulary yet.</param>
    Private Function AddWordToVocabulary(ByVal word As String) As Integer

        ' Determine the next word index to use, starting from the last one handed out
        Dim WordIndex As Integer = LastAddedWordIndex
        While Words.ContainsKey(WordIndex)
            WordIndex += 1
        End While
        LastAddedWordIndex = WordIndex

        ' Add the word to the vocabulary
        Words.Add(WordIndex, word)
        WordNumbers.Add(word, WordIndex)

        ' Return the index
        Return WordIndex

    End Function

    ''' <summary>
    ''' Adds a document name to the list of files and returns the index assigned to it.
    ''' </summary>
    ''' <param name="fileName">The name of the document.</param>
    ''' <exception cref="ArgumentException">A document with this name is already in the index.</exception>
    Private Function AddFileName(ByVal fileName As String) As Integer

        ' Validate before touching either dictionary so a failure cannot leave them out of sync
        ThrowIfAlreadyIndexed(fileName)

        ' Determine the next file name index to use, starting from the last one handed out
        Dim FileNameIndex As Integer = LastAddedFileIndex
        While Files.ContainsKey(FileNameIndex)
            FileNameIndex += 1
        End While
        LastAddedFileIndex = FileNameIndex

        ' Add the file name to the list of files
        Files.Add(FileNameIndex, fileName)
        FileNumbers.Add(fileName, FileNameIndex)

        ' Return the index
        Return FileNameIndex

    End Function

    ''' <summary>
    ''' Throws when a document with the given name is already part of the index.
    ''' </summary>
    ''' <param name="fileName">The document name to check.</param>
    Private Sub ThrowIfAlreadyIndexed(ByVal fileName As String)
        If FileNumbers.ContainsKey(fileName) Then
            Throw New ArgumentException("The document '" + fileName + "' already exists in the index.")
        End If
    End Sub

    ''' <summary>
    ''' Creates an index from a file previously written by Save.
    ''' </summary>
    ''' <param name="fileName">The path of the saved index.</param>
    ''' <returns>A new Indexer populated from the file.</returns>
    Public Shared Function LoadIndex(ByVal fileName As String) As Indexer

        Dim Loaded As New Indexer

        ' Open read-only so that an index stored in a read-only file or location can be loaded
        Using Reader As New BinaryReader(New FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))

            ' Get list of files
            Dim FileCount As Integer = Reader.ReadInt32
            For I As Integer = 1 To FileCount
                Dim StoredIndex As Integer = Reader.ReadInt32
                Dim StoredName As String = Reader.ReadString
                Loaded.Files.Add(StoredIndex, StoredName)
                Loaded.FileNumbers.Add(StoredName, StoredIndex)
            Next

            ' Get global vocabulary
            Dim VocabularyWordCount As Integer = Reader.ReadInt32
            For I As Integer = 1 To VocabularyWordCount
                Dim WordIndex As Integer = Reader.ReadInt32
                Dim StoredWord As String = Reader.ReadString
                Loaded.Words.Add(WordIndex, StoredWord)
                Loaded.WordNumbers.Add(StoredWord, WordIndex)
            Next

            ' Get the main part of the index
            Dim IndexEntriesCount As Integer = Reader.ReadInt32
            For A As Integer = 1 To IndexEntriesCount
                Dim MainKey As Integer = Reader.ReadInt32
                Dim TotalOccurrences As Integer = Reader.ReadInt32
                Dim StoredDocumentCount As Integer = Reader.ReadInt32
                Dim Documents(StoredDocumentCount - 1) As Integer
                For B As Integer = 0 To StoredDocumentCount - 1
                    Documents(B) = Reader.ReadInt32
                Next
                Loaded.Index.Add(MainKey, New WordOccurrences(TotalOccurrences, Documents))
            Next

        End Using

        Return Loaded

    End Function

    ''' <summary>
    ''' Writes the index to the given file, replacing any existing file.
    ''' </summary>
    ''' <param name="fileName">The path of the file to write.</param>
    ''' <remarks>
    ''' For each word only the total count and the indices of its documents are written;
    ''' the per-document counts are not part of the file.
    ''' NOTE(review): how WordOccurrences restores per-document counts on load is not
    ''' visible from this class - confirm.
    ''' </remarks>
    Public Sub Save(ByVal fileName As String)

        ' Using guarantees the file is flushed and closed even when writing fails
        Using Writer As New BinaryWriter(New FileStream(fileName, FileMode.Create))

            ' Write list of files
            Writer.Write(Files.Count)
            For Each IndexedFilename As KeyValuePair(Of Integer, String) In Files
                Writer.Write(IndexedFilename.Key)
                Writer.Write(IndexedFilename.Value)
            Next

            ' Write global vocabulary
            Writer.Write(Words.Count)
            For Each StoredWord As KeyValuePair(Of Integer, String) In Words
                Writer.Write(StoredWord.Key)
                Writer.Write(StoredWord.Value)
            Next

            ' Write the main part of the index
            Writer.Write(Index.Count)
            For Each IndexEntry As KeyValuePair(Of Integer, WordOccurrences) In Index
                Writer.Write(IndexEntry.Key)
                Writer.Write(IndexEntry.Value.Occurrences)
                Writer.Write(IndexEntry.Value.Documents.Count)
                For Each DocumentEntry As KeyValuePair(Of Integer, Integer) In IndexEntry.Value.Documents
                    Writer.Write(DocumentEntry.Key)
                Next
            Next

        End Using

    End Sub

    ''' <summary>
    ''' Returns a string describing the index in full.
    ''' </summary>
    ''' <remarks>
    ''' One line is produced per word, in the following format:
    '''
    ''' WORD: TotalOccurrences {DocumentFilename|DocumentOccurrences,...}
    ''' </remarks>
    Public Overrides Function ToString() As String

        ' Write out every word...
        Dim Result As New Text.StringBuilder
        For Each WordIndex As Integer In Index.Keys
            Dim WordEntry As WordOccurrences = Index(WordIndex)
            Result.Append(Words(WordIndex)).Append(": ").Append(WordEntry.Occurrences).Append(" {")

            ' ...and every occurrence of the word...
            Dim CommaRequired As Boolean = False
            For Each DocumentIndex As Integer In WordEntry.Documents.Keys
                Dim DocumentFilename As String = Files(DocumentIndex)
                Dim DocumentOccurrences As Integer = WordEntry.Documents(DocumentIndex)
                If CommaRequired Then Result.Append(",") Else CommaRequired = True
                Result.Append(DocumentFilename).Append("|").Append(DocumentOccurrences.ToString)
            Next

            Result.Append("}").Append(Environment.NewLine)
        Next
        Return Result.ToString

    End Function

End Class