-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathInvertedIndexCompareHashSetAndList.cs
More file actions
243 lines (191 loc) · 8.36 KB
/
InvertedIndexCompareHashSetAndList.cs
File metadata and controls
243 lines (191 loc) · 8.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
/// This program compares using a HashSet and List to store the postings for a simple inverted index.
namespace InvertedIndexCompareHashSetAndList
{
/// <summary>
/// Template pattern from the inverted index.
/// </summary>
/// <typeparam name="TDocID">Type of the docID</typeparam>
public abstract class InvertedIndexTemplate<TDocID>
{
protected static readonly Regex splitWords = new Regex(@"[A-Za-z]+");
public void Add(string text, TDocID docId)
{
foreach (var word in SplitWords(text))
{
if (!CheckIfWordExistsInIndex(word))
AddWordToIndex(word);
if (CheckDocIDShouldBeAddedToWordPostings(docId, word))
AddDocIDToWordPostings(docId, word);
}
}
protected abstract void AddWordToIndex(string word);
protected abstract bool CheckDocIDShouldBeAddedToWordPostings(TDocID docId, string word);
protected abstract void AddDocIDToWordPostings(TDocID docId, string word);
public IList<TDocID> Search(string keywords)
{
IEnumerable<TDocID> docIDsWithMatches = null;
foreach (var word in SplitWords(keywords))
{
if (!CheckIfWordExistsInIndex(word))
return new List<TDocID>(); // Word does not exist in the index.
if (docIDsWithMatches == null)
docIDsWithMatches = GetPostingsForWord(word);
else
{
docIDsWithMatches = CheckCurrentWordPostingsAgainstEarlierWordPostings(docIDsWithMatches, word);
if (docIDsWithMatches == null)
return new List<TDocID>(); // This word does not exist in the same documents as the rest of the keywords.
}
}
return docIDsWithMatches.ToList();
}
protected abstract IEnumerable<TDocID> CheckCurrentWordPostingsAgainstEarlierWordPostings(IEnumerable<TDocID> docIDsWithMatches, string word);
protected abstract IEnumerable<TDocID> GetPostingsForWord(string word);
protected abstract bool CheckIfWordExistsInIndex(string word);
protected static IEnumerable<string> SplitWords(string text)
{
var words = splitWords.Matches(text);
foreach (Match wordMatch in words)
yield return wordMatch.Value;
}
}
/// <summary>
/// Inverted index implemented with a Dictionary and a HashSet.
/// </summary>
public class HashSetInvertedIndex<TDocID> : InvertedIndexTemplate<TDocID>
{
private readonly Dictionary<string, HashSet<TDocID>> wordToDocIDPostings = new Dictionary<string, HashSet<TDocID>>();
protected override void AddWordToIndex(string word)
{
wordToDocIDPostings[word] = new HashSet<TDocID>();
}
protected override bool CheckDocIDShouldBeAddedToWordPostings(TDocID docId, string word)
{
return !wordToDocIDPostings[word].Contains(docId);
}
protected override void AddDocIDToWordPostings(TDocID docId, string word)
{
wordToDocIDPostings[word].Add(docId);
}
protected override bool CheckIfWordExistsInIndex(string word)
{
return wordToDocIDPostings.ContainsKey(word);
}
protected override IEnumerable<TDocID> GetPostingsForWord(string word)
{
return wordToDocIDPostings[word];
}
protected override IEnumerable<TDocID> CheckCurrentWordPostingsAgainstEarlierWordPostings(IEnumerable<TDocID> docIDsWithMatches, string word)
{
return docIDsWithMatches.Intersect(GetPostingsForWord(word));
}
}
/// <summary>
/// Inverted index implemented with a Dictionary and a list.
/// </summary>
public class ListInvertedIndex<TDocID> : InvertedIndexTemplate<TDocID>
{
private readonly Dictionary<string, List<TDocID>> wordToDocIDPostings = new Dictionary<string, List<TDocID>>();
protected override void AddWordToIndex(string word)
{
wordToDocIDPostings[word] = new List<TDocID>();
}
protected override bool CheckDocIDShouldBeAddedToWordPostings(TDocID docId, string word)
{
return !wordToDocIDPostings[word].Contains(docId);
}
protected override void AddDocIDToWordPostings(TDocID docId, string word)
{
wordToDocIDPostings[word].Add(docId);
}
protected override bool CheckIfWordExistsInIndex(string word)
{
return wordToDocIDPostings.ContainsKey(word);
}
protected override IEnumerable<TDocID> GetPostingsForWord(string word)
{
return wordToDocIDPostings[word];
}
protected override IEnumerable<TDocID> CheckCurrentWordPostingsAgainstEarlierWordPostings(IEnumerable<TDocID> docIDsWithMatches, string word)
{
return docIDsWithMatches.Intersect(GetPostingsForWord(word));
}
}
class Program
{
static void Main(string[] args)
{
var docs = GetDocs();
var keywords = new List<string>() { "a", "the", "war", "the war", "peace", "war and peace", "sea", "air", "land" };
TestLoadingAndSearchingIndex(docs, keywords, new HashSetInvertedIndex<int>(), "HashSetInvertedIndex");
TestLoadingAndSearchingIndex(docs, keywords, new ListInvertedIndex<int>(), "ListInvertedIndex");
Console.ReadKey();
}
private static void TestLoadingAndSearchingIndex(IList<string> docs, IList<string> keywords, InvertedIndexTemplate<int> invertedIndex, string indexTypeName)
{
double currentLoadTime = 0;
double lastLoadTime = 0;
double currentSearchTime = 0;
double lastSearchTime = 0;
for (var i = 0; i <= 5; i++)
{
var stopwatch = new Stopwatch();
stopwatch.Start();
docs.Each((line, index) => { invertedIndex.Add(line, index); });
stopwatch.Stop();
currentLoadTime = stopwatch.Elapsed.TotalMilliseconds;
stopwatch.Reset();
stopwatch.Start();
var searchResults = new List<int>();
foreach (var keyword in keywords)
searchResults.AddRange(invertedIndex.Search(keyword));
stopwatch.Stop();
currentSearchTime = stopwatch.Elapsed.TotalMilliseconds;
Console.WriteLine();
Console.WriteLine("{0} - {1} records, {2} keywords, {3} total results", indexTypeName, docs.Count().ToString(), keywords.Count().ToString(), searchResults.Count().ToString());
Console.WriteLine("Loading - {0} - {1}", currentLoadTime, (lastLoadTime != 0) ? (currentLoadTime / lastLoadTime) : 0);
Console.WriteLine("Searching - {0} - {1}", lastSearchTime, (lastLoadTime != 0) ? (currentSearchTime / lastSearchTime) : 0);
lastLoadTime = currentLoadTime;
lastSearchTime = currentSearchTime;
docs = DoubleDocs(docs);
}
}
private static IList<string> DoubleDocs(IList<string> originalDocs)
{
var docs = new List<string>();
docs.AddRange(originalDocs);
docs.AddRange(originalDocs);
originalDocs = null;
return docs;
}
/// <summary>
/// Simulate getting documents and storing them in strings.
/// </summary>
private static IList<String> GetDocs()
{
const string f = "pg2600.txt";
List<string> lines = new List<string>();
using (StreamReader r = new StreamReader(f))
{
string line;
while ((line = r.ReadLine()) != null)
lines.Add(line);
}
return lines.ToList();
}
}
public static class IEnumerableHelper
{
public static void Each<T>(this IEnumerable<T> ie, Action<T, int> action)
{
var index = 0;
foreach (var e in ie) action(e, index++);
}
}
}