Use case: Google's simhash algorithm
// Extensive testing shows simhash works quite well for comparing larger texts (say 500+ characters): pairs with a Hamming distance below 3 are almost always similar, and the false-positive rate is fairly low.
// In my experience, if N is the size of each chunk and M is the number of overlapping characters, N = 4 and M = 3 is the best choice.
using System;
using System.Collections.Generic;
using System.Linq;

public class SimHashAnalyser : IAnalyser
{
    private const int HashSize = 32;

    // Similarity in [0, 1]: the fraction of fingerprint bits the two texts share.
    public float GetLikenessValue(string needle, string haystack)
    {
        var needleSimHash = this.DoCalculateSimHash(needle);
        var hayStackSimHash = this.DoCalculateSimHash(haystack);
        return (HashSize - GetHammingDistance(needleSimHash, hayStackSimHash)) / (float)HashSize;
    }

    // Hash every token to a 32-bit value.
    private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)
    {
        var hashedTokens = new List<int>();
        foreach (string token in tokens)
        {
            hashedTokens.Add(token.GetHashCode());
        }
        return hashedTokens;
    }

    // Number of bit positions in which the two fingerprints differ.
    private static int GetHammingDistance(int firstValue, int secondValue)
    {
        var hammingBits = firstValue ^ secondValue;
        var hammingValue = 0;
        for (int i = 0; i < HashSize; i++)
        {
            if (IsBitSet(hammingBits, i))
            {
                hammingValue += 1;
            }
        }
        return hammingValue;
    }

    private static bool IsBitSet(int b, int pos)
    {
        return (b & (1 << pos)) != 0;
    }

    // Classic simhash: every token votes +1/-1 on each bit position;
    // positions with a positive total become 1-bits of the fingerprint.
    private int DoCalculateSimHash(string input)
    {
        // Chunk size 4, overlap 3 — the N = 4, M = 3 choice from the comment above.
        ITokeniser tokeniser = new OverlappingStringTokeniser(4, 3);
        var hashedTokens = DoHashTokens(tokeniser.Tokenise(input));
        var vector = new int[HashSize]; // int arrays are zero-initialised
        foreach (var value in hashedTokens)
        {
            for (var j = 0; j < HashSize; j++)
            {
                if (IsBitSet(value, j))
                {
                    vector[j] += 1;
                }
                else
                {
                    vector[j] -= 1;
                }
            }
        }
        var fingerprint = 0;
        for (var i = 0; i < HashSize; i++)
        {
            if (vector[i] > 0)
            {
                fingerprint |= 1 << i;
            }
        }
        return fingerprint;
    }
}
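One caveat worth noting: token.GetHashCode() is not guaranteed to be stable across processes or runtime versions (on .NET Core / .NET 5+, string hash codes are randomised per process), so fingerprints computed in one run cannot be stored and compared against fingerprints from another. If persistent fingerprints are needed, a deterministic hash can be substituted inside DoHashTokens. The GetStableHash method below is only a sketch of one option (FNV-1a over the token's characters); it is not part of the original code.

    // Hypothetical replacement for token.GetHashCode() inside SimHashAnalyser:
    // 32-bit FNV-1a, deterministic across runs and machines.
    private static int GetStableHash(string token)
    {
        unchecked
        {
            uint hash = 2166136261;
            foreach (char c in token)
            {
                hash ^= c;          // mix in each character
                hash *= 16777619;   // FNV prime
            }
            return (int)hash;
        }
    }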
public interface IAnalyser
{
    float GetLikenessValue(string needle, string haystack);
}

public interface ITokeniser
{
    IEnumerable<string> Tokenise(string input);
}
// Splits the input into fixed-size, non-overlapping chunks.
public class FixedSizeStringTokeniser : ITokeniser
{
    private readonly ushort tokensize = 5;

    public FixedSizeStringTokeniser(ushort tokenSize)
    {
        if (tokenSize < 2 || tokenSize > 127)
        {
            throw new ArgumentException("Token size is out of range");
        }
        this.tokensize = tokenSize;
    }

    public IEnumerable<string> Tokenise(string input)
    {
        var chunks = new List<string>();
        int offset = 0;
        while (offset < input.Length)
        {
            chunks.Add(new string(input.Skip(offset).Take(this.tokensize).ToArray()));
            offset += this.tokensize;
        }
        return chunks;
    }
}
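To illustrate (the sample input here is assumed, not from the original post), FixedSizeStringTokeniser simply cuts the string into back-to-back chunks, with a possibly shorter tail:

    ITokeniser fixedTokeniser = new FixedSizeStringTokeniser(5);
    // "hello world" (11 chars) → "hello", " worl", "d"
    Console.WriteLine(string.Join(", ", fixedTokeniser.Tokenise("hello world")));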
// Splits the input into chunks of chunkSize characters, where consecutive
// chunks overlap by overlapSize characters (a sliding character n-gram).
public class OverlappingStringTokeniser : ITokeniser
{
    private readonly ushort chunkSize = 4;
    private readonly ushort overlapSize = 3;

    public OverlappingStringTokeniser(ushort chunkSize, ushort overlapSize)
    {
        if (chunkSize <= overlapSize)
        {
            throw new ArgumentException("Chunk size must be greater than overlap size");
        }
        this.overlapSize = overlapSize;
        this.chunkSize = chunkSize;
    }
    public IEnumerable<string> Tokenise(string input)
    {
        var result = new List<string>();
        int position = 0;
        // <= so the final full-length chunk at the end of the string is emitted too
        while (position <= input.Length - this.chunkSize)
        {
            result.Add(input.Substring(position, this.chunkSize));
            position += this.chunkSize - this.overlapSize;
        }
        return result;
    }
}
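With the chunkSize = 4, overlapSize = 3 setting that SimHashAnalyser uses, this is a 4-character window sliding one character at a time, so consecutive tokens share 3 characters (sample input assumed):

    ITokeniser overlapping = new OverlappingStringTokeniser(4, 3);
    // "simhash" → "simh", "imha", "mhas", "hash"
    Console.WriteLine(string.Join(", ", overlapping.Tokenise("simhash")));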
Usage:
const string HayStack = "中国香港………………";
const string Needle = "中国香港 2013………………";
IAnalyser analyser = new SimHashAnalyser();
var likeness = analyser.GetLikenessValue(Needle, HayStack);
Console.Clear();
Console.WriteLine("Likeness: {0}%", likeness * 100);
Console.ReadKey();
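To turn the likeness back into the rule of thumb from the opening comment (Hamming distance below 3 on a 32-bit fingerprint), compare it against (32 - 3) / 32 ≈ 0.906. The check below can be added to the sample above; the threshold is just that comment restated, not something the original sample does:

    // likeness = (32 - distance) / 32, so likeness > 29/32 ⇔ distance < 3
    bool isNearDuplicate = likeness > (32 - 3) / 32f;
    Console.WriteLine("Near duplicate: {0}", isNearDuplicate);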
SimHash for C#
This is a reposted article; copyright belongs to the original author. Original: https://www.cnblogs.com/zengxiangzhan/p/3311114.html