http://fukushimu.blog.shinobi.jp/Entry/76/ を参考に
$ wget http://download.wikimedia.org/jawiki/latest/jawiki-latest-all-titles-in-ns0.gz
$ gunzip jawiki-latest-all-titles-in-ns0.gz
$ perl conv.pl
$ ls
conv.pl jawiki-latest-all-titles-in-ns0 wikipedia.csv
$ /usr/local/libexec/mecab/mecab-dict-index -d /usr/local/lib/mecab/dic/ipadic -u wikipedia.dic -f utf8 -t utf8 wikipedia.csv
複数のユーザ辞書を追加するには、dicrc の userdic に辞書のパスを ":" で区切って列挙する
$ vi /usr/local/lib/mecab/dic/ipadic/dicrc
;userdic
userdic = /home/foo/bar/foo.dic:/home/foo/bar2/usr.dic:/home/foo/bar3/bar.dic
conv.pl
http://fukushimu.blog.shinobi.jp/Entry/76/ を参考に作成する
#!/usr/bin/perl
# conv.pl - convert a list of Wikipedia page titles into a user-dictionary CSV.
#
# Reads one title per line from "jawiki-latest-all-titles-in-ns0" (UTF-8),
# skips unwanted titles, and writes one proper-noun CSV row per remaining
# title to "wikipedia.csv". Every title read is also echoed to STDOUT.
use strict;
use warnings;
use utf8;    # the regex and the CSV template below contain UTF-8 literals

binmode(STDOUT, ":utf8");

# "use encoding 'utf8'" used to be here. That pragma is deprecated and is
# rejected by perl 5.26 and later, so the encoding is now declared
# explicitly on each file handle instead.

my $file1 = "jawiki-latest-all-titles-in-ns0";
my $file2 = "wikipedia.csv";

# Decode the input so that length() and the regexes work on characters,
# not on raw bytes; fail loudly instead of silently producing an empty CSV.
open(my $in, '<:encoding(UTF-8)', $file1)
    or die "Cannot open $file1 for reading: $!";
open(my $out, '>:encoding(UTF-8)', $file2)
    or die "Cannot open $file2 for writing: $!";

# Read line by line: the title dump is large, and for(<FH>) would load
# the whole file into memory before the first iteration.
while (my $title = <$in>) {
    chomp($title);
    print $title . "\n";

    ## Skip unwanted words
    next if $title =~ /^\./;            # titles starting with a dot
    # Disambiguation pages. The "の" is optional so that both the original
    # spelling and the "(曖昧さ回避)" suffix are filtered.
    # NOTE(review): confirm the suffix against the actual dump.
    next if $title =~ /曖昧さの?回避/;
    next if $title =~ /^[0-9]+$/;       # purely numeric titles
    # next if $title =~ /[0-9]{4}./;    # (disabled filter, kept for reference)

    # Only titles longer than three characters are emitted.
    next unless length($title) > 3;

    # Fourth CSV field: -400 * length^1.5, clamped so it never goes below
    # -36000 (presumably the MeCab word cost, so longer titles are
    # preferred). The power must be written with "**"; "^" is bitwise XOR
    # in Perl and would compute (length XOR 1) instead.
    my $cost = max(-36000, -400 * (length($title) ** 1.5));

    # NOTE(review): a title containing "," would shift the CSV columns;
    # such titles are written unmodified, as before.
    print {$out} "$title,0,0,$cost,名詞,固有名詞,*,*,*,*,$title,*,*,wikipedia_word,\n";
}

close($in);
# Buffered write errors only surface at close time, so check it.
close($out) or die "Cannot close $file2: $!";
# Return the larger of two numbers, truncated toward zero by int().
#   max($floor, $candidate)
# When both are equal, the second argument is the one returned.
sub max {
    my ($floor, $candidate) = @_;
    return int($candidate) if $floor <= $candidate;
    return int($floor);
}