/// <summary>
/// Computes the Jaccard coefficient of the padded q-gram sets of two strings:
/// the number of distinct q-grams the strings share divided by the number of
/// distinct q-grams occurring in either of them.
/// </summary>
/// <remarks>
/// Despite the method name, the returned value behaves like a similarity:
/// it is 1 when both q-gram sets are equal and 0 when they share nothing.
/// The q-gram length is taken from the field <c>n</c> (declared outside this
/// method; presumably set by the constructor - verify against the class).
/// </remarks>
/// <param name="source">First string to compare.</param>
/// <param name="target">Second string to compare.</param>
/// <returns>
/// 1 if both strings are empty; 0 if exactly one is empty or if either string
/// is shorter than <c>n</c>; otherwise shared q-grams / total distinct q-grams.
/// </returns>
public float GetDistanceFast(String source, String target)
{
    int sl = source.Length;
    int tl = target.Length;

    // Two empty strings count as identical; an empty string shares nothing
    // with a non-empty one.
    if (sl == 0 || tl == 0)
    {
        return sl == tl ? 1f : 0f;
    }

    // Strings shorter than the q-gram length are scored 0, even when they
    // are equal to each other. Kept as-is because callers may rely on it.
    if (sl < n || tl < n)
    {
        return 0f;
    }

    HashSet<string> sourceGrams = CollectPaddedQGrams(source, n);
    HashSet<string> targetGrams = CollectPaddedQGrams(target, n);

    // Size of the intersection: q-grams present in both sets.
    int matches = 0;
    foreach (string qgram in sourceGrams)
    {
        if (targetGrams.Contains(qgram))
        {
            matches++;
        }
    }

    // |A union B| = |A| + |B| - |A intersect B|, so no third set is needed.
    int unionCount = sourceGrams.Count + targetGrams.Count - matches;

    return (float)matches / (float)unionCount;
}

/// <summary>
/// Builds the set of distinct q-grams of <paramref name="text"/> as if the
/// text were padded with <paramref name="size"/> - 1 NUL characters on each side.
/// </summary>
/// <param name="text">Text to split into q-grams.</param>
/// <param name="size">Length of every q-gram.</param>
/// <returns>The distinct padded q-grams of <paramref name="text"/>.</returns>
private static HashSet<string> CollectPaddedQGrams(string text, int size)
{
    HashSet<string> grams = new HashSet<string>();

    // A padded text of length (text.Length + 2 * size - 2) has this many windows.
    int windowCount = text.Length + size - 1;
    if (windowCount <= 0)
    {
        return grams;
    }

    // One buffer is reused for every window; new string(char[]) copies it.
    char[] window = new char[size];
    for (int start = 0; start < windowCount; start++)
    {
        for (int offset = 0; offset < size; offset++)
        {
            // Position in the unpadded text; out-of-range positions are padding.
            int index = start + offset - (size - 1);
            window[offset] = (index < 0 || index >= text.Length) ? (char)0 : text[index];
        }

        grams.Add(new string(window));
    }

    return grams;
}
/// <summary>
/// Manual smoke routine: builds a <c>JaccardDistance</c> with argument 2 and
/// scores "stree" against "street" and against "steere".
/// The results are only stored in locals; nothing is asserted or printed.
/// </summary>
public static void TestJaccard()
{
    JaccardDistance metric = new JaccardDistance(2);

    string reference = "stree";
    string withTrailingT = "street";
    string reordered = "steere";

    double scoreVersusStreet = metric.GetDistanceFast(reference, withTrailingT);
    double scoreVersusSteere = metric.GetDistanceFast(reference, reordered);

    // Unused value; presumably a convenient line for a debugger breakpoint.
    double breakpointAnchor = 1;
}