/Source Code/Indexer/Indexer.vb
Visual Basic | 531 lines | 315 code | 112 blank | 104 comment | 0 complexity | 8d6cbd1cbdc784a882c4c892a307ad9c MD5 | raw file
- Imports System.IO
-
- Public Class Indexer
-
''' <summary>Maps a file index to the file name registered under it.</summary>
Private Files As Dictionary(Of Integer, String) = New Dictionary(Of Integer, String)()

''' <summary>Maps a file name to its file index (the reverse of <c>Files</c>).</summary>
Private FileNumbers As Dictionary(Of String, Integer) = New Dictionary(Of String, Integer)()

''' <summary>Position from which the search for the next unused file index starts.</summary>
Private LastAddedFileIndex As Integer = 0

''' <summary>Maps a word index to the word registered under it.</summary>
Private Words As Dictionary(Of Integer, String) = New Dictionary(Of Integer, String)()

''' <summary>Maps a word to its word index (the reverse of <c>Words</c>).</summary>
Private WordNumbers As Dictionary(Of String, Integer) = New Dictionary(Of String, Integer)()

''' <summary>Position from which the search for the next unused word index starts.</summary>
Private LastAddedWordIndex As Integer = 0

''' <summary>Maps a word index to the occurrence data recorded for that word.</summary>
Private Index As Dictionary(Of Integer, WordOccurrences) = New Dictionary(Of Integer, WordOccurrences)()
-
''' <summary>
''' Returns the complete list of document numbers.
''' </summary>
''' <value>
''' A newly allocated array holding the index of every document currently registered,
''' in the enumeration order of the file table.
''' </value>
ReadOnly Property AllDocumentsNumbers() As Integer()
    Get
        ' Size the array from the same dictionary the keys are copied from, so the
        ' copy can never overrun even if the two lookup tables were to disagree.
        Dim Result(Files.Count - 1) As Integer
        Files.Keys.CopyTo(Result, 0)
        Return Result
    End Get
End Property
-
''' <summary>
''' Gets how many documents are currently held in the index.
''' </summary>
''' <value>The number of entries in the file table.</value>
ReadOnly Property DocumentCount() As Integer
    Get
        Dim Total As Integer = Files.Count
        Return Total
    End Get
End Property
-
''' <summary>
''' Looks up a document's name from its numeric index.
''' </summary>
''' <param name="index">The numeric index of the document.</param>
''' <value>The name stored for <paramref name="index"/>.</value>
''' <exception cref="ApplicationException">No document is registered under the given index.</exception>
ReadOnly Property DocumentName(ByVal index As Integer) As String
    Get
        Dim Name As String = Nothing
        If Not Files.TryGetValue(index, Name) Then
            Throw New ApplicationException("Document " + index.ToString + " does not exist in the index.")
        End If
        Return Name
    End Get
End Property
-
''' <summary>
''' Looks up a document's numeric index from its name.
''' </summary>
''' <param name="name">The name under which the document was registered.</param>
''' <value>The numeric index stored for <paramref name="name"/>.</value>
''' <exception cref="ApplicationException">No document is registered under the given name.</exception>
ReadOnly Property DocumentNumber(ByVal name As String) As Integer
    Get
        Dim Number As Integer
        If Not FileNumbers.TryGetValue(name, Number) Then
            Throw New ApplicationException("The document '" + name + "' does not exist in the index.")
        End If
        Return Number
    End Get
End Property
-
''' <summary>
''' Returns the document indices of documents containing the specified word.
''' </summary>
''' <param name="word">The word to search for. It is upper-cased before the lookup.</param>
''' <returns>
''' A new list with the index of every document recorded for the word; the list is
''' empty when the word is not part of the vocabulary.
''' </returns>
''' <exception cref="ArgumentNullException"><paramref name="word"/> is Nothing.</exception>
''' <exception cref="InvalidOperationException">
''' The word is in the vocabulary but has no entry in the index (inconsistent state).
''' </exception>
Function DocumentsContainingWord(ByVal word As String) As List(Of Integer)

    If word Is Nothing Then Throw New ArgumentNullException("word")

    Dim Result As New List(Of Integer)

    ' Unknown words simply match no documents
    Dim WordIndex As Integer = -1
    If Not WordNumbers.TryGetValue(word.ToUpper, WordIndex) Then Return Result

    Dim Occurrences As WordOccurrences = Nothing
    If Not Index.TryGetValue(WordIndex, Occurrences) Then
        Throw New InvalidOperationException("The word " + word + " exists in the words collection but not the index itself.")
    End If

    ' Use the entry that was just retrieved rather than looking it up a second time
    For Each DocumentIndex As Integer In Occurrences.Documents.Keys
        Result.Add(DocumentIndex)
    Next

    Return Result

End Function
-
''' <summary>
''' Reports how often a word occurs in one particular document.
''' </summary>
''' <param name="documentIndex">The index of the document to look in.</param>
''' <param name="word">The word to look for. It is upper-cased before the lookup.</param>
''' <returns>
''' The count recorded for the word in that document, or 0 when the word is not in
''' the vocabulary or has no entry for the document.
''' </returns>
Function TermFrequency(ByVal documentIndex As Integer, ByVal word As String) As Integer

    ' A word that is not in the vocabulary cannot occur anywhere
    Dim VocabularyIndex As Integer = -1
    If Not WordNumbers.TryGetValue(word.ToUpper, VocabularyIndex) Then
        Return 0
    End If

    ' TryGetValue leaves the frequency at zero when the document has no entry
    Dim Frequency As Integer = 0
    Index(VocabularyIndex).Documents.TryGetValue(documentIndex, Frequency)
    Return Frequency

End Function
-
''' <summary>
''' Resets the indexer to its initial, empty state.
''' </summary>
''' <remarks>
''' Empties the file tables, the vocabulary tables and the index itself, and resets
''' both index counters to zero.
''' </remarks>
Public Sub Clear()

    ' The index itself
    Index.Clear()

    ' Vocabulary tables and their counter
    WordNumbers.Clear()
    Words.Clear()
    LastAddedWordIndex = 0

    ' File tables and their counter
    FileNumbers.Clear()
    Files.Clear()
    LastAddedFileIndex = 0

End Sub
-
''' <summary>
''' Returns the unique words in the given file's contents.
''' </summary>
''' <param name="fileContents">The textual contents of a file.</param>
''' <returns>
''' The distinct words produced by <c>SplitIntoFixedWords</c>, sorted by the default
''' string comparer.
''' </returns>
Private Function GetVocabulary(ByVal fileContents As String) As String()

    ' SplitIntoFixedWords already upper-cases the text, so no extra pass is needed here.
    ' A SortedDictionary keeps the same ordering as the SortedList used previously but
    ' inserts in O(log n) instead of shifting elements on every out-of-order word.
    Dim Vocabulary As New SortedDictionary(Of String, Integer)
    For Each Word As String In SplitIntoFixedWords(fileContents)
        Vocabulary(Word) = 0
    Next

    Dim Result(Vocabulary.Count - 1) As String
    Vocabulary.Keys.CopyTo(Result, 0)
    Return Result

End Function
-
''' <summary>
''' Counts how many times each vocabulary word occurs in the given document.
''' </summary>
''' <param name="fileContents">The text of the document whose words are counted.</param>
''' <returns>
''' A dictionary from word index to occurrence count, containing only the words that
''' occur at least once in the document.
''' </returns>
''' <remarks>
''' Every word in the document must already be in the vocabulary; looking up an
''' unknown word throws.
''' </remarks>
Private Function GetWordCounts(ByVal fileContents As String) As Dictionary(Of Integer, Integer)

    ' Start a tally at zero for every word in the vocabulary
    Dim Tally As New Dictionary(Of Integer, Integer)
    For Each KnownWordIndex As Integer In Words.Keys
        Tally.Add(KnownWordIndex, 0)
    Next

    ' Bump the tally once per occurrence in this document
    For Each Token As String In SplitIntoFixedWords(fileContents)
        Dim TokenIndex As Integer = WordNumbers(Token)
        Tally(TokenIndex) = Tally(TokenIndex) + 1
    Next

    ' Keep only the words that actually occur
    Dim Result As New Dictionary(Of Integer, Integer)
    For Each Entry As KeyValuePair(Of Integer, Integer) In Tally
        If Entry.Value > 0 Then Result.Add(Entry.Key, Entry.Value)
    Next
    Return Result

End Function
-
''' <summary>
''' Splits the document into upper-cased words.
''' </summary>
''' <param name="fileContents">The document to split into words.</param>
''' <returns>
''' The words of the document in their original order, upper-cased and passed through
''' <c>FixWord</c>; pieces that are empty after fixing are omitted. Returns an empty
''' array for Nothing or empty input.
''' </returns>
''' <remarks>
''' Words are separated by space, tab, carriage return, line feed and the characters
''' ! . : " and -. Any other symbol is not a separator; it is handled by
''' <c>FixWord</c> inside the word that contains it.
''' </remarks>
Private Function SplitIntoFixedWords(ByVal fileContents As String) As String()

    If String.IsNullOrEmpty(fileContents) Then Return New String() {}

    ' Carriage return and line feed are listed individually: passing Environment.NewLine
    ' (a String) where a Char separator is expected made at most its first character a
    ' separator, so LF-only line breaks (and tabs) glued adjacent words together.
    Dim Separators As Char() = New Char() {" "c, "!"c, "."c, ":"c, """"c, "-"c, _
                                           ControlChars.Tab, ControlChars.Cr, ControlChars.Lf}

    Dim FixedWords As New List(Of String)
    For Each RawWord As String In fileContents.ToUpper.Split(Separators, StringSplitOptions.RemoveEmptyEntries)
        ' A piece made up only of symbols is empty after fixing and is dropped
        Dim Cleaned As String = FixWord(RawWord)
        If Cleaned <> "" Then FixedWords.Add(Cleaned)
    Next

    Return FixedWords.ToArray

End Function
-
''' <summary>
''' Find each file in the given directory and index it.
''' </summary>
''' <param name="directory">The directory whose files to index.</param>
''' <remarks>
''' Files are not recursed, and so only the files in the specified directory itself will be indexed.
''' Files whose name is already registered in the index are skipped, and documents that
''' were indexed earlier are left untouched, so this method can be called on an index
''' that already contains documents.
''' </remarks>
Public Sub AddDirectoryToIndex(ByVal directory As String)

    ' Register the file names, remembering which documents are new. Only these are
    ' processed below; re-processing earlier documents would add their words and
    ' counts a second time.
    Dim NewFileIndices As New List(Of Integer)
    For Each Filename As String In IO.Directory.GetFiles(directory)
        If Not FileNumbers.ContainsKey(Filename) Then
            NewFileIndices.Add(AddFileName(Filename))
        End If
    Next

    ' Phase 1: Build the combined, sorted vocabulary of the new files
    Dim Vocabulary As New SortedDictionary(Of String, Integer)
    For Each FileIndex As Integer In NewFileIndices
        Dim Contents As String = GetFileContents(Files(FileIndex))
        For Each Word As String In GetVocabulary(Contents)
            Vocabulary(Word) = 0
        Next
    Next

    ' Copy the words that are not yet known into the global vocabulary
    For Each Word As String In Vocabulary.Keys
        If Not WordNumbers.ContainsKey(Word) Then AddWordToVocabulary(Word)
    Next

    ' Phase 2: Determine how many times each word occurs in each new document
    For Each FileIndex As Integer In NewFileIndices

        ' The file is read a second time rather than keeping every file's text in memory
        Dim Contents As String = GetFileContents(Files(FileIndex))
        Dim DocumentWordCount As Dictionary(Of Integer, Integer) = GetWordCounts(Contents)

        ' Store number of times each word occurs in the document (by merging with the global index)
        MergeDocumentWordCount(FileIndex, DocumentWordCount)

    Next

End Sub
-
''' <summary>
''' Reads the given file and adds its contents to the index under the file's own name.
''' </summary>
''' <param name="fileName">The file to read; also used as the document's name in the index.</param>
Public Sub AddFileToIndex(ByVal fileName As String)
    Dim Contents As String = GetFileContents(fileName)
    AddFileContentsToIndex(fileName, Contents)
End Sub
-
''' <summary>
''' Reads the given file and adds its contents to the index under a different name.
''' </summary>
''' <param name="virtualFilename">The name under which the document is registered in the index.</param>
''' <param name="fileName">The file whose contents are read and indexed.</param>
Public Sub AddFileToIndex(ByVal virtualFilename As String, ByVal fileName As String)
    Dim Contents As String = GetFileContents(fileName)
    AddFileContentsToIndex(virtualFilename, Contents)
End Sub
-
''' <summary>
''' Adds a document to the index from text that is already in memory.
''' </summary>
''' <param name="virtualFilename">The name under which the document is registered in the index.</param>
''' <param name="contents">The text of the document.</param>
''' <exception cref="ArgumentNullException">
''' <paramref name="virtualFilename"/> or <paramref name="contents"/> is Nothing.
''' </exception>
''' <exception cref="ArgumentException">
''' A document named <paramref name="virtualFilename"/> is already in the index.
''' </exception>
Public Sub AddFileContentsToIndex(ByVal virtualFilename As String, ByVal contents As String)

    ' Validate everything before touching any state, so that a rejected call
    ' cannot leave a half-registered document behind.
    If virtualFilename Is Nothing Then Throw New ArgumentNullException("virtualFilename")
    If contents Is Nothing Then Throw New ArgumentNullException("contents")
    If FileNumbers.ContainsKey(virtualFilename) Then
        Throw New ArgumentException("The document '" + virtualFilename + "' already exists in the index.", "virtualFilename")
    End If

    ' Add to the list of files
    Dim FileIndex As Integer = AddFileName(virtualFilename)

    ' GetVocabulary already returns each word once, in sorted order, so the words
    ' that are not yet known can be copied straight into the global vocabulary.
    For Each Word As String In GetVocabulary(contents)
        If Not WordNumbers.ContainsKey(Word) Then AddWordToVocabulary(Word)
    Next

    ' Count each word in the document
    Dim DocumentWordCount As Dictionary(Of Integer, Integer) = GetWordCounts(contents)

    ' Store number of times each word occurs in the document (by merging with the global index)
    MergeDocumentWordCount(FileIndex, DocumentWordCount)

End Sub
-
''' <summary>
''' Removes a document, and every word that occurs only in it, from the index.
''' </summary>
''' <param name="virtualFilename">The name under which the document was registered.</param>
''' <exception cref="ApplicationException">No document with the given name is in the index.</exception>
Public Sub RemoveDocumentFromIndex(ByVal virtualFilename As String)

    ' Resolve the document's index, failing if the document is unknown
    Dim DocumentIndex As Integer
    If Not FileNumbers.TryGetValue(virtualFilename, DocumentIndex) Then
        Throw New ApplicationException("The document '" + virtualFilename + "' does not exist in the index.")
    End If

    ' Drop the document from both file tables
    FileNumbers.Remove(virtualFilename)
    Files.Remove(DocumentIndex)

    ' Take the document out of every word that mentions it. Words left without any
    ' document are only collected here, because the index cannot be modified while
    ' it is being enumerated.
    Dim OrphanedWords As New List(Of Integer)
    For Each Entry As KeyValuePair(Of Integer, WordOccurrences) In Index

        Dim Occurrences As WordOccurrences = Entry.Value
        Dim CountInDocument As Integer = 0
        If Not Occurrences.Documents.TryGetValue(DocumentIndex, CountInDocument) Then Continue For

        ' Subtract this document's share of the word's total and forget the document
        Occurrences.Occurrences -= CountInDocument
        Occurrences.Documents.Remove(DocumentIndex)

        If Occurrences.Documents.Count = 0 Then OrphanedWords.Add(Entry.Key)

    Next

    ' Remove the orphaned words from the index and from both vocabulary tables
    For Each OrphanIndex As Integer In OrphanedWords
        Index.Remove(OrphanIndex)
        WordNumbers.Remove(Words(OrphanIndex))
        Words.Remove(OrphanIndex)
    Next

End Sub
-
''' <summary>
''' Loads all of the text in the given file, and returns it.
''' </summary>
''' <param name="filename">The file whose contents to retrieve.</param>
''' <returns>The complete text of the file.</returns>
Private Function GetFileContents(ByVal filename As String) As String

    ' Using releases the file handle even when reading fails part-way through
    Using Reader As New IO.StreamReader(filename)
        Return Reader.ReadToEnd
    End Using

End Function
-
''' <summary>
''' Removes every character that is not an unaccented Latin letter or a decimal digit.
''' </summary>
''' <param name="word">The word to 'fix'.</param>
''' <returns>
''' The characters of <paramref name="word"/> in the ranges a-z, A-Z and 0-9, in their
''' original order; all other characters are dropped.
''' </returns>
Private Function FixWord(ByVal word As String) As String

    Dim Kept As New System.Text.StringBuilder
    For Each Symbol As Char In word
        Dim IsLower As Boolean = (Symbol >= "a"c AndAlso Symbol <= "z"c)
        Dim IsUpper As Boolean = (Symbol >= "A"c AndAlso Symbol <= "Z"c)
        Dim IsDigit As Boolean = (Symbol >= "0"c AndAlso Symbol <= "9"c)
        If IsLower OrElse IsUpper OrElse IsDigit Then
            Kept.Append(Symbol)
        End If
    Next
    Return Kept.ToString

End Function
-
''' <summary>
''' Folds one document's word counts into the global index.
''' </summary>
''' <param name="documentFilenameIndex">The index of the document the counts belong to.</param>
''' <param name="documentWordCount">The document's counts, keyed by word index.</param>
''' <remarks>
''' For a word that already has an index entry, the count is added to the word's total
''' and recorded against the document; otherwise a new entry is created for the word.
''' </remarks>
Private Sub MergeDocumentWordCount(ByVal documentFilenameIndex As Integer, _
                                   ByVal documentWordCount As Dictionary(Of Integer, Integer))

    For Each Entry As KeyValuePair(Of Integer, Integer) In documentWordCount

        Dim Existing As WordOccurrences = Nothing
        If Index.TryGetValue(Entry.Key, Existing) Then
            ' Known word: grow its total and record this document's count
            Existing.Occurrences += Entry.Value
            Existing.Documents.Add(documentFilenameIndex, Entry.Value)
        Else
            ' First document to contain this word
            Index.Add(Entry.Key, New WordOccurrences(Entry.Value, documentFilenameIndex))
        End If

    Next

End Sub
-
''' <summary>
''' Registers a word in the global vocabulary and returns its word index.
''' </summary>
''' <param name="word">The word to register.</param>
''' <returns>
''' The index assigned to the word, or the index it already had when the word was
''' registered before.
''' </returns>
Private Function AddWordToVocabulary(ByVal word As String) As Integer

    ' A word that is already known keeps its index. Checking first also prevents the
    ' two tables from getting out of step: previously a duplicate was added to Words
    ' and only then rejected by WordNumbers.
    Dim ExistingIndex As Integer
    If WordNumbers.TryGetValue(word, ExistingIndex) Then Return ExistingIndex

    ' Determine the next word index to use, skipping any that are taken
    Dim WordIndex As Integer = LastAddedWordIndex
    While Words.ContainsKey(WordIndex)
        WordIndex += 1
    End While
    LastAddedWordIndex = WordIndex

    ' Add the word to both lookup tables
    Words.Add(WordIndex, word)
    WordNumbers.Add(word, WordIndex)

    ' Return the index
    Return WordIndex

End Function
-
''' <summary>
''' Registers a file name in the file tables and returns its file index.
''' </summary>
''' <param name="fileName">The file name to register.</param>
''' <returns>The index assigned to the file name.</returns>
''' <exception cref="ArgumentNullException"><paramref name="fileName"/> is Nothing.</exception>
''' <exception cref="ArgumentException">The file name is already registered.</exception>
Private Function AddFileName(ByVal fileName As String) As Integer

    ' Reject bad input before changing anything. Previously the name was added to
    ' Files first and only then rejected by FileNumbers, which left the two tables
    ' out of step. The exception types are the ones Dictionary.Add used to raise.
    If fileName Is Nothing Then Throw New ArgumentNullException("fileName")
    If FileNumbers.ContainsKey(fileName) Then
        Throw New ArgumentException("The document '" + fileName + "' already exists in the index.", "fileName")
    End If

    ' Determine the next file name index to use, skipping any that are taken
    Dim FileNameIndex As Integer = LastAddedFileIndex
    While Files.ContainsKey(FileNameIndex)
        FileNameIndex += 1
    End While
    LastAddedFileIndex = FileNameIndex

    ' Add the file name to both lookup tables
    Files.Add(FileNameIndex, fileName)
    FileNumbers.Add(fileName, FileNameIndex)

    ' Return the index
    Return FileNameIndex

End Function
-
''' <summary>
''' Creates an indexer from a file previously written by <c>Save</c>.
''' </summary>
''' <param name="fileName">The index file to read.</param>
''' <returns>A new indexer populated from the file.</returns>
''' <remarks>
''' The file consists of three sections, each starting with an entry count:
''' the file table (index, name), the vocabulary (index, word) and the index entries
''' (word index, total occurrences, document count, then one document index per document).
''' Only document indices are read for each word.
''' NOTE(review): how the WordOccurrences constructor derives per-document counts from
''' them is not visible here - confirm.
''' </remarks>
Public Shared Function LoadIndex(ByVal fileName As String) As Indexer

    ' A freshly created indexer is already empty, so nothing needs clearing first
    Dim Result As New Indexer

    ' Open for reading only: the default access for FileMode.Open is read/write, which
    ' fails on read-only files even though nothing is written here.
    Using Source As New FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read)
        Using Reader As New BinaryReader(Source)

            ' Get list of files
            Dim FileCount As Integer = Reader.ReadInt32
            For I As Integer = 1 To FileCount
                Dim FileIndex As Integer = Reader.ReadInt32
                Dim StoredName As String = Reader.ReadString
                Result.Files.Add(FileIndex, StoredName)
                Result.FileNumbers.Add(StoredName, FileIndex)
            Next

            ' Get global vocabulary
            Dim VocabularyWordCount As Integer = Reader.ReadInt32
            For I As Integer = 1 To VocabularyWordCount
                Dim WordIndex As Integer = Reader.ReadInt32
                Dim Word As String = Reader.ReadString
                Result.Words.Add(WordIndex, Word)
                Result.WordNumbers.Add(Word, WordIndex)
            Next

            ' Get the main part of the index
            Dim IndexEntriesCount As Integer = Reader.ReadInt32
            For A As Integer = 1 To IndexEntriesCount
                Dim MainKey As Integer = Reader.ReadInt32
                Dim Occurrences As Integer = Reader.ReadInt32
                Dim DocumentsForWord As Integer = Reader.ReadInt32
                Dim Documents(DocumentsForWord - 1) As Integer
                For B As Integer = 0 To DocumentsForWord - 1
                    Documents(B) = Reader.ReadInt32
                Next
                Result.Index.Add(MainKey, New WordOccurrences(Occurrences, Documents))
            Next

        End Using
    End Using

    Return Result

End Function
-
''' <summary>
''' Writes the index to a binary file, replacing the file if it already exists.
''' </summary>
''' <param name="fileName">The file to write.</param>
''' <remarks>
''' Three sections are written, each starting with an entry count: the file table
''' (index, name), the vocabulary (index, word) and the index entries (word index,
''' total occurrences, document count, then one document index per document).
''' </remarks>
Public Sub Save(ByVal fileName As String)

    ' Using closes the file even when a write fails part-way through
    Using Target As New FileStream(fileName, FileMode.Create)
        Using Writer As New BinaryWriter(Target)

            ' Write list of files
            Writer.Write(Files.Count)
            For Each IndexedFilename As KeyValuePair(Of Integer, String) In Files
                Writer.Write(IndexedFilename.Key)
                Writer.Write(IndexedFilename.Value)
            Next

            ' Write global vocabulary
            Writer.Write(Words.Count)
            For Each Word As KeyValuePair(Of Integer, String) In Words
                Writer.Write(Word.Key)
                Writer.Write(Word.Value)
            Next

            ' Write the main part of the index
            Writer.Write(Index.Count)
            For Each IndexEntry As KeyValuePair(Of Integer, WordOccurrences) In Index
                Writer.Write(IndexEntry.Key)
                Writer.Write(IndexEntry.Value.Occurrences)
                Writer.Write(IndexEntry.Value.Documents.Count)

                ' NOTE(review): only the document index is written; the per-document
                ' count (DocumentEntry.Value) is not part of the file. Persisting it
                ' would change the file format, so it is left as is - confirm whether
                ' this is intended.
                For Each DocumentEntry As KeyValuePair(Of Integer, Integer) In IndexEntry.Value.Documents
                    Writer.Write(DocumentEntry.Key)
                Next
            Next

        End Using
    End Using

End Sub
-
''' <summary>
''' Builds a textual dump of the whole index.
''' </summary>
''' <returns>One line per indexed word, each terminated by a line break.</returns>
''' <remarks>
''' Each line has the following format:
'''
''' WORD: TotalOccurrences {DocumentFilename|DocumentOccurrences,...}
''' </remarks>
Overrides Function ToString() As String

    Dim Output As New Text.StringBuilder

    ' One line per word...
    For Each Entry As KeyValuePair(Of Integer, WordOccurrences) In Index

        Dim Occurrences As WordOccurrences = Entry.Value
        Output.Append(Words(Entry.Key))
        Output.Append(": ")
        Output.Append(Occurrences.Occurrences)
        Output.Append(" {")

        ' ...listing every document the word occurs in, separated by commas
        Dim IsFirst As Boolean = True
        For Each DocumentEntry As KeyValuePair(Of Integer, Integer) In Occurrences.Documents
            Dim DocumentFilename As String = Files(DocumentEntry.Key)
            If Not IsFirst Then
                Output.Append(",")
            End If
            IsFirst = False
            Output.Append(DocumentFilename)
            Output.Append("|")
            Output.Append(DocumentEntry.Value.ToString)
        Next

        Output.Append("}")
        Output.Append(Environment.NewLine)

    Next

    Return Output.ToString

End Function
-
- End Class