#!/bin/bash
# Walk through all files in the current dir (and subdirs) and compare
# them pairwise with the other files, showing the percentage of
# duplicated lines.

# Which file type to compare?  (It wouldn't make sense to compare
# binary formats, so restrict to one text extension.)
ext="txt"

# Support filenames with spaces: split only on newline.  The trailing
# backspace keeps the classic "echo -en" form's exact value; \b never
# appears in real filenames.
IFS=$'\n\b'

working_dir="$PWD"
working_dir_name=${working_dir##*/}   # basename via parameter expansion

# Scratch lists live next to (not inside) the scanned tree so they are
# not picked up by find.
all_files="$working_dir/../$working_dir_name-filelist.txt"
remaining_files="$working_dir/../$working_dir_name-remaining.txt"

# Build "<size> <path>" lines for every matching file, largest first.
# - grep -v "/\."  skips hidden files and directories
# - "\.$ext$"      anchors the extension at end-of-name, so e.g.
#                  "notes.txt.bak" is no longer matched by accident
# NOTE: stat -c is GNU coreutils; adapt for BSD stat if needed.
find . -type f -print0 | xargs -0 stat -c "%s %n" | grep -v "/\." | \
    grep "\.$ext$" | sort -nr > "$all_files"
cp "$all_files" "$remaining_files"
# Outer loop: take each file A in turn and compare it against every
# file that comes after it in the list ($remaining_files shrinks by
# one line per iteration), so each pair is compared exactly once.
while read -r string; do
    # Each line is "<size> <path>"; strip everything up to the first
    # space to recover the path (works for paths containing spaces).
    fileA=${string#* }
    # Drop A itself from the remaining list.
    tail -n +2 "$remaining_files" > "$remaining_files.temp"
    mv "$remaining_files.temp" "$remaining_files"
    # Remove empty lines since they produce false positives.
    sed '/^$/d' "$fileA" > tempA
    echo "Comparing $fileA with other files..."
    while read -r string; do
        fileB=${string#* }
        sed '/^$/d' "$fileB" > tempB
        A_len=$(wc -l < tempA)
        B_len=$(wc -l < tempB)
        # sdiff -s prints only differing lines; -B ignores blank lines.
        differences=$(sdiff -B -s tempA tempB | wc -l)
        common=$(( A_len - differences ))
        # Guard against division by zero when B is empty after
        # blank-line stripping (the original bc call crashed here).
        if (( B_len > 0 )); then
            percentage=$(( 100 * common / B_len ))
        else
            percentage=0
        fi
        if [[ $percentage -gt 15 ]]; then
            echo " $percentage% duplication in" \
                "$(echo "$fileB" | sed 's|\./||')"
        fi
    done < "$remaining_files"
    echo " "
done < "$all_files"
# Clean up scratch files.  -f suppresses errors when a file was never
# created (e.g. no files with the chosen extension were found).
rm -f tempA tempB
rm -f -- "$all_files" "$remaining_files"
4条答案
按热度按时间v64noz0r1#
像这样的吗?
两个文件,A1和A2。
$ sdiff -B -b -s A1 A2 | wc
会告诉你有多少行不同。wc 给出总行数,两者相除即可得到比例。-b 和 -B 表示忽略空白和空行,-s 表示不输出相同的行。
2izufjch2#
https://superuser.com/questions/347560/is-there-a-tool-to-measure-file-difference-percentage对此有一个很好的解决方案,
更多选项请参见man wdiff。
ibrsph3r3#
下面是一个脚本,它将比较所有.txt文件,并显示具有超过15%重复的文件:
e7arh2l64#
下面是一个使用 comm 的快速 bash 解决方案。空白行将被忽略。示例结果:
wdiff 和 sdiff 的解决方案非常好,但这些实用程序通常不会预装在默认环境中。