' Source: /Source Code/Indexer/Indexer.vb (Visual Basic, 531 lines)
  1Imports System.IO
  2
  3Public Class Indexer
  4
  5    Private Files As New Dictionary(Of Integer, String) ' Key = File index, Value = File name
  6    Private FileNumbers As New Dictionary(Of String, Integer) ' Key = File name, Value = File index
  7    Private LastAddedFileIndex As Integer = 0
  8
  9    Private Words As New Dictionary(Of Integer, String) ' Key = Word index, Value = Word
 10    Private WordNumbers As New Dictionary(Of String, Integer) ' Key = Word, Value = Word index
 11    Private LastAddedWordIndex As Integer = 0
 12
 13    Private Index As New Dictionary(Of Integer, WordOccurrences) ' Key = Word index
 14
 15    ''' <summary>
 16    ''' Returns the complete list of document numbers.
 17    ''' </summary>
 18    ReadOnly Property AllDocumentsNumbers() As Integer()
 19        Get
 20            Dim Result(FileNumbers.Count - 1) As Integer
 21            Dim I As Integer = 0
 22            For Each Document As KeyValuePair(Of Integer, String) In Files
 23                Result(I) = Document.Key
 24                I += 1
 25            Next
 26            Return Result
 27        End Get
 28    End Property
 29
 30    ''' <summary>
 31    ''' Returns the total number of documents indexed.
 32    ''' </summary>
 33    ReadOnly Property DocumentCount() As Integer
 34        Get
 35            Return Files.Count
 36        End Get
 37    End Property
 38
 39    ''' <summary>
 40    ''' Returns the name of the document, given its index.
 41    ''' </summary>
 42    ReadOnly Property DocumentName(ByVal index As Integer) As String
 43        Get
 44            If Files.ContainsKey(index) Then
 45                Return Files(index)
 46            Else
 47                Throw New ApplicationException("Document " + index.ToString + " does not exist in the index.")
 48            End If
 49        End Get
 50    End Property
 51
 52    ''' <summary>
 53    ''' Returns the numeric index of the document, given its name.
 54    ''' </summary>
 55    ReadOnly Property DocumentNumber(ByVal name As String) As Integer
 56        Get
 57            If FileNumbers.ContainsKey(name) Then
 58                Return FileNumbers(name)
 59            Else
 60                Throw New ApplicationException("The document '" + name + "' does not exist in the index.")
 61            End If
 62        End Get
 63    End Property
 64
 65    ''' <summary>
 66    ''' Returns the document indices of documents containing the specified words.
 67    ''' </summary>
 68    ''' <param name="word">The word to search for.</param>
 69    Function DocumentsContainingWord(ByVal word As String) As List(Of Integer)
 70
 71        Dim Result As New List(Of Integer)
 72        Dim WordIndex As Integer = -1
 73        If WordNumbers.TryGetValue(word.ToUpper, WordIndex) Then
 74
 75            Dim Occurrences As WordOccurrences = Nothing
 76            Dim Found As Boolean = Index.TryGetValue(WordIndex, Occurrences)
 77
 78            If Not Found Then
 79                Throw New Exception("The word " + word + " exists in the words collection but not the index itself.")
 80            Else
 81                For Each DocumentIndex As Integer In Index(WordIndex).Documents.Keys
 82                    Result.Add(DocumentIndex)
 83                Next
 84            End If
 85
 86        End If
 87        Return Result
 88
 89    End Function
 90
 91    ''' <summary>
 92    ''' Returns the number of times the given word occurs in the given document.
 93    ''' </summary>
 94    ''' <param name="documentIndex">The document's index to search in.</param>
 95    ''' <param name="word">The word to search for.</param>
 96    Function TermFrequency(ByVal documentIndex As Integer, ByVal word As String) As Integer
 97
 98        Dim Result As Integer = 0
 99
100        Dim WordIndex As Integer = -1
101        Dim WordFound As Boolean = WordNumbers.TryGetValue(word.ToUpper, WordIndex)
102        If WordFound Then
103
104            ' Get the document frequency and return it
105            Dim Occurrences As WordOccurrences = Index(WordIndex)
106            Occurrences.Documents.TryGetValue(documentIndex, Result)
107
108        End If
109
110        Return Result
111
112    End Function
113
114    ''' <summary>
115    ''' Empties the index.
116    ''' </summary>
117    Public Sub Clear()
118
119        Files.Clear()
120        FileNumbers.Clear()
121
122        Words.Clear()
123        WordNumbers.Clear()
124
125        LastAddedWordIndex = 0
126        LastAddedFileIndex = 0
127
128        Index.Clear()
129
130    End Sub
131
132    ''' <summary>
133    ''' Returns the unique words in the given file's contents.
134    ''' </summary>
135    ''' <param name="fileContents">The textual contents of a file.</param>
136    ''' <returns>A list of the unique words in the given text.</returns>
137    Private Function GetVocabulary(ByVal fileContents As String) As String()
138
139        fileContents = fileContents.ToUpper
140        Dim WordsInFile As String() = SplitIntoFixedWords(fileContents)
141        Dim Vocabulary As New SortedList(Of String, Integer)
142        For Each Word As String In WordsInFile
143            If Not Vocabulary.ContainsKey(Word) Then Vocabulary.Add(Word, 0)
144        Next
145        Dim Result(Vocabulary.Count - 1) As String
146        Vocabulary.Keys.CopyTo(Result, 0)
147        Return Result
148
149    End Function
150
151    ''' <summary>
152    ''' Returns the number of times each word occurs in the given document.
153    ''' </summary>
154    ''' <param name="fileContents">The document containing words to count.</param>
155    Private Function GetWordCounts(ByVal fileContents As String) As Dictionary(Of Integer, Integer)
156
157        ' Create a list of all the words, each set as occurring 0 times
158        Dim WordCounts As New Dictionary(Of Integer, Integer)
159        For Each Index As Integer In Words.Keys
160            WordCounts.Add(Index, 0)
161        Next
162
163        ' Count the number of occurrances of every word in this document
164        Dim WordsInFile As String() = SplitIntoFixedWords(fileContents)
165        For Each Word As String In WordsInFile
166            Dim WordIndex As Integer = WordNumbers(Word)
167            WordCounts(WordIndex) += 1
168        Next
169
170        ' Determine which words have a count greater than zero and include only those in the result
171        Dim WordsToInclude As New List(Of Integer)
172        For Each WordIndex As Integer In WordCounts.Keys
173            If WordCounts(WordIndex) > 0 Then WordsToInclude.Add(WordIndex)
174        Next
175        Dim Result As New Dictionary(Of Integer, Integer)
176        For Each WordIndex As Integer In WordsToInclude
177            Result.Add(WordIndex, WordCounts(WordIndex))
178        Next
179
180        Return Result
181
182    End Function
183
184    ''' <summary>
185    ''' Splits the document into words.
186    ''' </summary>
187    ''' <param name="fileContents">The document to split into words.</param>
188    ''' <returns>The document, split into words.</returns>
189    Private Function SplitIntoFixedWords(ByVal fileContents As String) As String()
190
191        fileContents = fileContents.ToUpper
192        Dim Words() As String = fileContents.Split(" "c, "!"c, "."c, ":"c, """"c, "-"c, Environment.NewLine)
193        Dim WordsToReturn(Words.Length - 1) As String
194        Dim Index As Integer = 0
195        For Each Word As String In Words
196            Word = FixWord(Word)
197            If Word <> "" Then
198                WordsToReturn(Index) = Word
199                Index += 1
200            End If
201        Next
202
203        ReDim Preserve WordsToReturn(Index - 1)
204        Return WordsToReturn
205
206    End Function
207
208    ''' <summary>
209    ''' Find each file in the given directory and index it.
210    ''' </summary>
211    ''' <param name="directory">The directory whose files to index.</param>
212    ''' <remarks>Files are not recursed, and so only the files in the specified directory itself will be indexed.</remarks>
213    Public Sub AddDirectoryToIndex(ByVal directory As String)
214
215        ' Get the file names
216        For Each Filename As String In IO.Directory.GetFiles(directory)
217            AddFileName(Filename)
218        Next
219
220        ' Phase 1: Build a global vocabulary
221        Dim Vocabulary As New SortedList(Of String, Integer)
222        For Each File As KeyValuePair(Of Integer, String) In Files
223
224            Dim Contents As String = GetFileContents(File.Value)
225
226            ' Build a vocabulary for the file and add it to the complete list of words
227            Dim FileVocabulary As String() = GetVocabulary(Contents)
228            For Each Word As String In FileVocabulary
229                If Not Vocabulary.ContainsKey(Word) Then Vocabulary.Add(Word, 0)
230            Next
231
232        Next
233
234        ' Copy the complete list of words into the global vocabulary
235        For Each Word As String In Vocabulary.Keys
236            AddWordToVocabulary(Word)
237        Next
238
239        ' Phase 2: Determine how many times each word occurs in each document
240        For Each File As KeyValuePair(Of Integer, String) In Files
241
242            ' Count each word as it is read in
243            Dim Contents As String = GetFileContents(File.Value)
244            Dim DocumentWordCount As Dictionary(Of Integer, Integer) = GetWordCounts(Contents)
245
246            ' Store number of times each word occurs in the document (by merging with the global index)
247            MergeDocumentWordCount(FileNumbers(File.Value), DocumentWordCount)
248
249        Next
250
251    End Sub
252
253    Public Sub AddFileToIndex(ByVal fileName As String)
254        AddFileContentsToIndex(fileName, GetFileContents(fileName))
255    End Sub
256
257    Public Sub AddFileToIndex(ByVal virtualFilename As String, ByVal fileName As String)
258        AddFileContentsToIndex(virtualFilename, GetFileContents(fileName))
259    End Sub
260
261    Public Sub AddFileContentsToIndex(ByVal virtualFilename As String, ByVal contents As String)
262
263        ' Add to the list of files
264        Dim FileIndex As Integer = AddFileName(virtualFilename)
265
266        ' Build a vocabulary for the file and add it to the complete list of words
267        Dim Vocabulary As New SortedList(Of String, Integer)
268        For Each Word As String In GetVocabulary(contents)
269            If Not Vocabulary.ContainsKey(Word) Then Vocabulary.Add(Word, 0)
270        Next
271
272        ' Copy the complete list of words into the global vocabulary
273        For Each Word As String In Vocabulary.Keys
274            If Not WordNumbers.ContainsKey(Word) Then AddWordToVocabulary(Word)
275        Next
276
277        ' Count each word as it is read in
278        Dim DocumentWordCount As Dictionary(Of Integer, Integer) = GetWordCounts(contents)
279
280        ' Store number of times each word occurs in the document (by merging with the global index)
281        MergeDocumentWordCount(FileIndex, DocumentWordCount)
282
283    End Sub
284
285    Public Sub RemoveDocumentFromIndex(ByVal virtualFilename As String)
286
287        ' Check the file exists
288        If Not FileNumbers.ContainsKey(virtualFilename) Then
289            Throw New ApplicationException("The document '" + virtualFilename + "' does not exist in the index.")
290        End If
291
292        ' Remove the file from the list of files
293        Dim DocumentIndex As Integer = FileNumbers(virtualFilename)
294        Files.Remove(DocumentIndex)
295        FileNumbers.Remove(virtualFilename)
296
297        ' Process each word in the index
298        Dim WordsToRemove As New List(Of Integer)
299        For Each Word As KeyValuePair(Of Integer, WordOccurrences) In Index
300            Dim WordIndex As Integer = Word.Key
301            Dim Occurrences As WordOccurrences = Word.Value
302
303            ' Remove the word from the index
304            If Occurrences.Documents.ContainsKey(DocumentIndex) Then
305
306                ' Remove the document from the word
307                Dim OccurrencesInDocument As Integer = Occurrences.Documents(DocumentIndex)
308                Occurrences.Occurrences -= OccurrencesInDocument
309                Occurrences.Documents.Remove(DocumentIndex)
310
311                ' This word no longer exists in any document
312                If Occurrences.Documents.Count = 0 Then WordsToRemove.Add(WordIndex)
313
314            End If
315
316        Next
317
318        ' Remove WordOccurrences objects that are no longer needed, as well as unused words in vocabulary
319        For Each WordIndex As Integer In WordsToRemove
320            Index.Remove(WordIndex)
321            WordNumbers.Remove(Words(WordIndex))
322            Words.Remove(WordIndex)
323        Next
324
325    End Sub
326
327    ''' <summary>
328    ''' Loads all of the text in the given file, and return it.
329    ''' </summary>
330    ''' <param name="filename">The file whose contents to retrieve.</param>
331    Private Function GetFileContents(ByVal filename As String) As String
332
333        Dim File As New IO.StreamReader(filename)
334        Dim Contents As String = File.ReadToEnd
335        File.Close()
336        File.Dispose()
337        Return Contents
338
339    End Function
340
341    ''' <summary>
342    ''' Strips the words of all non-alphanumeric characters.
343    ''' </summary>
344    ''' <param name="word">The word to 'fix'.</param>
345    ''' <returns>The word without any symbol characters.</returns>
346    Private Function FixWord(ByVal word As String) As String
347
348        Dim Result As String = ""
349        For Each C As Char In word
350            Select Case C
351                Case "a"c To "z"c, "A"c To "Z"c, "0"c To "9"c : Result += C
352            End Select
353        Next
354        Return Result
355
356    End Function
357
358    ''' <summary>
359    ''' Merges the document words count into the global index.
360    ''' </summary>
361    ''' <param name="documentFilenameIndex">The index of the document whose word counts to merge.</param>
362    ''' <param name="documentWordCount">A list of word indices and word counts.</param>
363    Private Sub MergeDocumentWordCount(ByVal documentFilenameIndex As Integer, _
364                                       ByVal documentWordCount As Dictionary(Of Integer, Integer))
365
366        For Each WordIndex As Integer In documentWordCount.Keys
367            Dim WordOccurrences As Integer = documentWordCount(WordIndex)
368            If Index.ContainsKey(WordIndex) Then
369                With Index(WordIndex)
370                    .Occurrences += WordOccurrences
371                    .Documents.Add(documentFilenameIndex, WordOccurrences)
372                End With
373            Else
374                Index.Add(WordIndex, New WordOccurrences(WordOccurrences, documentFilenameIndex))
375            End If
376        Next
377
378    End Sub
379
380    Private Function AddWordToVocabulary(ByVal word As String) As Integer
381
382        ' Determine the next word index to use
383        Dim WordIndex As Integer = LastAddedWordIndex
384        While Words.ContainsKey(WordIndex)
385            WordIndex += 1
386        End While
387        LastAddedWordIndex = WordIndex
388
389        ' Add the word to the vocabulary
390        Words.Add(WordIndex, word)
391        WordNumbers.Add(word, WordIndex)
392
393        ' Return the index
394        Return WordIndex
395
396    End Function
397
398    Private Function AddFileName(ByVal fileName As String) As Integer
399
400        ' Determine the next file name index to use
401        Dim FileNameIndex As Integer = LastAddedFileIndex
402        While Files.ContainsKey(FileNameIndex)
403            FileNameIndex += 1
404        End While
405        LastAddedFileIndex = FileNameIndex
406
407        ' Add the word to the vocabulary
408        Files.Add(FileNameIndex, fileName)
409        FileNumbers.Add(fileName, FileNameIndex)
410
411        ' Return the index
412        Return FileNameIndex
413
414    End Function
415
416    Public Shared Function LoadIndex(ByVal fileName As String) As Indexer
417
418        Dim Result As New Indexer
419
420        ' Get list of files
421        Using File As New BinaryReader(New FileStream(fileName, FileMode.Open))
422
423            Dim FileCount As Integer = File.ReadInt32
424            Result.Files.Clear()
425            Result.FileNumbers.Clear()
426            For I As Integer = 1 To FileCount
427                Dim DocumentIndex As Integer = File.ReadInt32
428                Dim DocumentName As String = File.ReadString
429                Result.Files.Add(DocumentIndex, DocumentName)
430                Result.FileNumbers.Add(DocumentName, DocumentIndex)
431            Next
432
433            ' Get global vocabulary
434            Dim VocabularyWordCount As Integer = File.ReadInt32
435            Result.Words.Clear()
436            Result.WordNumbers.Clear()
437            For I As Integer = 1 To VocabularyWordCount
438                Dim WordIndex As Integer = File.ReadInt32
439                Dim Word As String = File.ReadString
440                Result.Words.Add(WordIndex, Word)
441                Result.WordNumbers.Add(Word, WordIndex)
442            Next
443
444            ' Get the main part of the index
445            Dim IndexEntriesCount As Integer = File.ReadInt32
446            Result.Index.Clear()
447            For A As Integer = 1 To IndexEntriesCount
448                Dim MainKey As Integer = File.ReadInt32
449                Dim Occurrences As Integer = File.ReadInt32
450                Dim DocumentCount As Integer = File.ReadInt32
451                Dim Documents(DocumentCount - 1) As Integer
452                For B As Integer = 0 To DocumentCount - 1
453                    Documents(B) = File.ReadInt32
454                Next
455                Result.Index.Add(MainKey, New WordOccurrences(Occurrences, Documents))
456            Next
457
458            ' Close the file
459            File.Close()
460
461        End Using
462
463        Return Result
464
465    End Function
466
467    Public Sub Save(ByVal fileName As String)
468
469        ' Write list of files
470        Dim File As New BinaryWriter(New FileStream(fileName, FileMode.Create))
471        File.Write(Files.Count)
472        For Each IndexedFilename As KeyValuePair(Of Integer, String) In Files
473            File.Write(IndexedFilename.Key)
474            File.Write(IndexedFilename.Value)
475        Next
476
477        ' Write global vocabulary
478        File.Write(Words.Count)
479        For Each Word As KeyValuePair(Of Integer, String) In Words
480            File.Write(Word.Key)
481            File.Write(Word.Value)
482        Next
483
484        ' Write the main part of the index
485        File.Write(Index.Count)
486        For Each IndexEntry As KeyValuePair(Of Integer, WordOccurrences) In Index
487            File.Write(IndexEntry.Key)
488            File.Write(IndexEntry.Value.Occurrences)
489            File.Write(IndexEntry.Value.Documents.Count)
490            For Each DocumentEntry As KeyValuePair(Of Integer, Integer) In IndexEntry.Value.Documents
491                File.Write(DocumentEntry.Key)
492            Next
493        Next
494
495        ' Close the file
496        File.Close()
497
498    End Sub
499
500    ''' <summary>
501    ''' Returns a string describing the index in full.
502    ''' </summary>
503    ''' <remarks>
504    ''' The index is returned in the following format:
505    ''' 
506    ''' WORD: TotalOccurrances {DocumentFilename|DocumentOccurrances,...}
507    ''' </remarks>
508    Overrides Function ToString() As String
509
510        ' Write out every word...
511        Dim Result As New Text.StringBuilder
512        For Each WordIndex As Integer In Index.Keys
513            With Index(WordIndex)
514                Result.Append(Words(WordIndex)).Append(": ").Append(.Occurrences).Append(" {")
515
516                ' ...and every occurrance of the word...
517                Dim CommaRequired As Boolean = False
518                For Each DocumentIndex As Integer In .Documents.Keys
519                    Dim DocumentFilename As String = Files(DocumentIndex)
520                    Dim DocumentOccurrances As Integer = .Documents(DocumentIndex)
521                    If CommaRequired Then Result.Append(",") Else CommaRequired = True
522                    Result.Append(DocumentFilename).Append("|").Append(DocumentOccurrances.ToString)
523                Next
524            End With
525            Result.Append("}").Append(Environment.NewLine)
526        Next
527        Return Result.ToString
528
529    End Function
530
531End Class