use strict;
use warnings;
use utf8;
use Encode qw(decode encode);
use Text::WagnerFischer qw();
use DBI;
 
# Precompiled pattern for a URL embedded in text: "http://" or "https://"
# (an optional leading "s" is also accepted, so "shttp://" matches too),
# followed by one or more characters from the class listed below.
my $http_pat = qr/s?https?:\/\/[-_.!~*'()a-zA-Z0-9;\/?:\@&=+\$,%#]+/;
 
# Open a connection to the MySQL database and return the DBI handle.
# Dies, with the DBI error text appended, if the connection cannot be made.
sub db_connect {
	# `return EXPR or die ...` parses as `(return EXPR) or die ...`, so the
	# die could never run. Connect first, check, and only then return.
	my $dbh = DBI->connect('DBI:mysql:foo:bar', 'foo', 'bar')
		or die "can't connect: $DBI::errstr";
	return $dbh;
}
 
# Load every row of the `data` table.
#
# Returns a flat (id => content, ...) list meant to be assigned to a hash.
# Each content value is decoded from EUC-JP via Encode::decode.
# Any arguments passed in are ignored.
# Dies if the connection, prepare or execute step fails.
sub getdata {
	# Guard the handle here as well, so a failed connect is reported clearly
	# instead of as a method call on an undefined value.
	my $dbh = db_connect()
		or die "can't connect";

	my $sth = $dbh->prepare("select id, content from data")
		or die "prepare failed: " . $dbh->errstr;
	$sth->execute()
		or die "execute failed: " . $sth->errstr;

	my %content_of;
	while (my ($id, $content) = $sth->fetchrow_array()) {
		$content_of{$id} = decode("eucjp", $content);
	}

	$sth->finish;
	$dbh->disconnect;
	return %content_of;
}
 
# Normalise a piece of text by passing it through strip_html, strip_ex_url
# and strip_cr, in that order.
#
# As soon as the text is false (undef, '' or '0') the remaining stages are
# skipped and the value is returned as it stands.
sub strip {
	my ($text) = @_;
	for my $filter (\&strip_html, \&strip_ex_url, \&strip_cr) {
		last unless $text;
		$text = $filter->($text);
	}
	return $text;
}
 
# Delete everything that looks like a markup tag: each shortest run that
# starts with '<', has at least one character, and ends with '>'.
# Because of /s a tag may span line breaks.
#
# Returns the stripped copy; the caller's string is left untouched.
sub strip_html {
	my ($text) = @_;
	$text =~ s/<.+?>//sg;
	return $text;
}
 
# Reduce every line that mentions a URL to just the first URL on that line;
# lines without a URL are passed through unchanged.
#
# Every output line is terminated with "\n". URLs are recognised with the
# file-level $http_pat.
sub strip_ex_url {
	my ($text) = @_;
	my @kept;
	for my $line (split /\n/, $text) {
		my @urls = $line =~ m/$http_pat/g;
		push @kept, (@urls ? $urls[0] : $line);
	}
	return join '', map { $_ . "\n" } @kept;
}
 
# Remove all newlines, ASCII spaces, tabs and full-width (ideographic)
# spaces from the text and return the compacted copy.
sub strip_cr {
	my ($text) = @_;
	$text =~ s/[\n 	　]//g;
	return $text;
}
 
# Load all articles from the DB as id => decoded content.
my %data = getdata();

# Group article ids by the first URL each article mentions.
# Keys are visited in sorted order so the grouping (and therefore the
# output) does not depend on hash iteration order.
my %group = ();
for my $key (sort keys %data) {
	my $value = $data{$key};
	next unless defined $value;    # undefined content cannot mention a URL
	if (my ($http) = ($value =~ m/$http_pat/g)) {
		push @{ $group{$http} }, $key;    # autovivifies the id list
	}
}

# Maximum edit distance, as a fraction of each text's length, for two
# articles to be reported as resembling each other.
my $threshold = 0.2;
for my $url (sort keys %group) {
	my $ids = $group{$url};

	# Compare only when exactly two articles are candidates for being similar.
	next unless @$ids == 2;

	my $x1 = $data{$ids->[0]};
	my $x2 = $data{$ids->[1]};
	my $distance = Text::WagnerFischer::distance($x1, $x2);

	# A SMALL normalised distance means the texts are alike. The original
	# test used '>' and so reported the dissimilar pairs instead.
	# Both lengths are non-zero here because each text contains a URL match.
	if ($distance / length($x1) < $threshold && $distance / length($x2) < $threshold) {
		print "$ids->[0] resemble $ids->[1]\n";
	}
}

とりあえず気付いたのは、単語の言い換えにものすごく弱いということ。文字コード・エンコード・文字セット・文字集合の類とか、エンコードと符号化、Canonとキヤノン、Linuxとリナックスとか、ドラッグアンドドロップとDnDとか。

編集距離では普通の人とルー大柴は比較できないということだな。

これで何をするかと言うと、重複チェッカー画面というのがあって（左がツリー、右がテキスト）、左でアイテムをクリックするとその内容が右に出る。重複してるなーと思ったら削除、とかやってたんだけど、微妙な差のものが多数あったので、カッとなってある程度自動で取捨選択したくなったんよ。少なくとも件数は減ったよ。気付いたけど、割合を出すところが間違ってたよ……。直したよ……。

[ツッコミを入れる]