LintCode/Top K Frequent Words

Problem Summary

Find top k frequent words with map reduce framework.

Solution

With map reduce, we only need to implement the mapper and the reducer.

The mapper part is easy to code. Note that there may be more than one consecutive spaces in the input. As for the reducer, we calculate the frequency of each word and push the pair (word,frequency) into a priority queue, i.e. a heap. After the reduce is done, we pop k pairs from the heap.

Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/**
* Definition of Input:
* template<class T>
* class Input {
* public:
* bool done();
* // Returns true if the iteration has elements or false.
* void next();
* // Move to the next element in the iteration
* // Runtime error if the iteration has no more elements
* T value();
* // Get the current element, Runtime error if
* // the iteration has no more elements
* }
* Definition of Document:
* class Document {
* public:
* int id; // document id
* string content; // document content
* }
*/
class TopKFrequentWordsMapper: public Mapper {
public:
void Map(Input<Document>* input) {
// Write your code here
// Please directly use func 'output' to output
// the results into output buffer.
// void output(string &key, int value);
while (!input->done())
{
string s = input->value().content;
int st = 0;
for (int en = 0; en <= s.length(); ++en)
{
if (en == s.length() || s[en] == ' ')
{
if (en > st)
{
string tmp = s.substr(st,en-st);
output(tmp,1);
}
st = en+1;
}
}
input->next();
}
}
};
class mycomparison
{
public:
bool operator() (const pair<string,int> &a,const pair<string,int> &b)
{
if (a.second < b.second)
return true;
else
if (a.second > b.second)
return false;
else
return (a.first > b.first);
}
};
class TopKFrequentWordsReducer: public Reducer {
public:
int K;
priority_queue<pair<string,int>,vector<pair<string,int>>,mycomparison> qu;
void setUp(int k) {
// initialize your data structure here
K = k;
}
void Reduce(string &key, Input<int>* input) {
int sum = 0;
while (!input->done())
{
sum += input->value();
input->next();
}
qu.push(make_pair(key,sum));
}
void cleanUp() {
// Please directly use func 'output' to output
// the top k pairs <word, times> into output buffer.
// void output(string &key, int &value);
for (int i = 0; i < K && !qu.empty(); ++i)
{
pair<string,int> cur = qu.top();
output(cur.first,cur.second);
qu.pop();
}
}
};