/// <summary> Simple similarity query generators.
/// Takes every unique word and forms a boolean query where all words are optional.
/// After you get this you'll use to to query your <see cref="IndexSearcher"/> for similar docs.
/// The only caveat is the first hit returned <b>should be</b> your source document - you'll
/// need to then ignore that.
///
/// <p/>
///
/// So, if you have a code fragment like this:
/// <br/>
/// <code>
/// Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
/// </code>
///
/// <p/>
///
/// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good')</c>.
///
/// <p/>
/// The philosophy behind this method is "two documents are similar if they share lots of words".
/// Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words.
///
/// <P/>
/// This method is fail-safe in that if a long 'body' is passed in and
/// <see cref="BooleanQuery.Add"/> (used internally)
/// throws
/// <see cref="BooleanQuery.TooManyClauses"/>, the
/// query as it is will be returned.
/// </summary>
/// <param name="body">the body of the document you want to find similar documents to
/// </param>
/// <param name="a">the analyzer to use to parse the body
/// </param>
/// <param name="field">the field you want to search on, probably something like "contents" or "body"
/// </param>
/// <param name="stop">optional set of stop words to ignore
/// </param>
/// <returns> a query with all unique words in 'body'
/// </returns>
/// <throws> IOException this can't happen... </throws>
public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
{
TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
BooleanQuery tmp = new BooleanQuery();
ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups
while (ts.IncrementToken())
{
String word = termAtt.Term;
// ignore opt stop words
if (stop != null && stop.Contains(word))
continue;
// ignore dups
if (already.Contains(word))
continue;
already.Add(word);
// add to query
TermQuery tq = new TermQuery(new Term(field, word));
try
{
tmp.Add(tq, Occur.SHOULD);
}
catch (BooleanQuery.TooManyClauses)
{
// fail-safe, just return what we have, not the end of the world
break;
}
}
return tmp;
}