#!/usr/bin/perl
#
my $usage = "Usage: convert.pl \n";
my @states; # state transition probabilities
my @matches; # match emission probabilities
my @inserts; # insert emission probabilities
my $M; # number of positions (match nodes) in the model
##
## read in a SAM 3.0 ASCII file
##
sub readBlock
{
local *HANDLE = shift; # file handle to read from
my $filename = shift; # name of the file
my $cols = shift; # number of columns to read
my $rows = shift; # number of rows to read
my $array = []; # array to return
my $i, @a;
for $i (1 .. $rows)
{
$_ = ;
@a = split /\s+/;
die "Parsing error on line $. of $filename!\n$usage\n..."
unless @a == $cols;
push @{$array}, @a;
};
return $array;
};
sub readSAM
{
my $filename = shift; # name of the file to parse
my $position = -2; # current position in the model
my $last = 0; # is this the last position?
my $array=[], @a, $i;
@states=();
@matches=();
@inserts=();
open( SAM, "$filename" )
or die "Cannot open $filename: $!\n$usage\n...";
while()
{
next unless $position >=0 or /^Begin/;
if( /^Begin/ ) {
$position=0;
} elsif ( /^End/ ) {
$position++;
$last=1;
} elsif( /^\s*(\S+)/ ) {
$position++;
die "Parsing error on line $. of $filename!\n..."
unless $1 == $position;
} else {
die "Parsing error on line $. of $filename!\n...";
};
push @states, readBlock( *SAM, $filename, 3, 3 );
push @matches, readBlock( *SAM, $filename, 5, 4 );
push @inserts, readBlock( *SAM, $filename, 5, 4 );
last if $last;
};
close SAM;
$M=$position-1;
};
##
## write a SAM 3.0 ASCII file
##
sub pad
{
my $len = shift; # pad $text so that the final length is $len
my $text = shift; # the text to pad
return " "x($len-length $text) . $text;
};
sub printBlock
{
local *HANDLE = shift; # file handle to write to
my $array = shift; # the array to write
my $cols = shift; # ... number of columns
my $rows = shift; # ... number of rows
my $i, $j;
for $i (0 .. $rows-1) {
for $j (0 .. $cols-1) {
printf HANDLE "%1.6f ", $array->[$i*$cols+$j];
};
print HANDLE "\n";
};
};
sub writeSAM
{
my $filename = shift; # file name to write to
my $position; # current position in the model
open( SAM, ">$filename" ) or die "Cannot open $filename: $!\n...";
print SAM "MODEL\nalphabet protein\n";
for $position (0 .. $M+1)
{
if( $position == 0 ) {
print SAM "Begin\n";
} elsif ( $position == $M+1 ) {
print SAM "End\n";
} else {
print SAM pad(4,$position)."\n";
};
printBlock(*SAM, $states[$position], 3, 3);
printBlock(*SAM, $matches[$position], 5, 4);
printBlock(*SAM, $inserts[$position], 5, 4);
};
print SAM "\nENDMODEL\n";
close SAM;
};
##
## read a HMMER save file
##
# the NULE line
my @NULE = ( 595, -1558, 85, 338, -294, 453, -1158, 197, 249, 902,
-1085, -142, -21, -313, 45, 531, 201, 384, -1998, -644 );
sub score
{
my $prob = shift; # convert this probability to a score
my $nullProb = shift; # given this null probability
return $prob != 0 ? int( 0.5+1000*log($prob/$nullProb)/log(2.0) ) : "*";
};
sub prob
{
my $score = shift; # convert this score to a probability
my $nullProb = shift; # given this null probability
return $score=~ /\*/ ? 0 : $nullProb*exp(log(2)*$score/1000);
};
sub readLine
{
local *HANDLE = shift; # file handle to read from
my $filename = shift; # name of the file
my $number = shift; # number of data expected on the line
my @array;
$_ = ;
@array = split /\s+/;
shift @array;
die "Parsing error on line $. of $filename\n..."
unless @array == $number;
return @array;
};
sub readHMMER
{
my $filename = shift; # name of the file to read from
# transition probabilities: $xy = P(X->Y)
my $dd, $dm, $md, $mm, $mi, $im, $ii, $bm, $me, $bd;
my $position; # current position in the model
my $i, @a, $total;
@states=();
@matches=();
@inserts=();
open( HMMER, "$filename" )
or die "Cannot open $filename: $!\n$usage\n...";
##
## header
##
while()
{
if( /^LENG\s\s(\d+)/ )
{
$M=$1;
# "allocate" @states [is this necessary?]
for $i (0 .. $M+1)
{
push @states, [0,0,0, 0,0,0, 0,0,0];
};
# the "Begin" position values
push @matches, [ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0 ];
push @inserts, [ 0.077850, 0.016640, 0.057325, 0.065451, 0.039844,
0.062177, 0.024314, 0.060479, 0.064418, 0.082109,
0.024547, 0.047302, 0.040276, 0.041960, 0.048771,
0.067948, 0.059895, 0.072042, 0.011740, 0.034912 ];
}
elsif( /^HMM\s/ )
{
;
last;
};
};
##
## "3-line"
##
@a = readLine( *HMMER, $filename, 3 );
$bd = prob( $a[2], 1 );
##
## data
##
for $position (1 .. $M)
{
##
## match emissions
##
@a = readLine( *HMMER, $filename, 22 );
shift @a; # what's this last number ??!
pop @a;
for $i (0 .. 19) { $a[$i] = prob( $a[$i], prob($NULE[$i],1/20) ); };
push @matches, [@a];
##
## insert emissions
##
@a = readLine( *HMMER, $filename, 21 );
shift @a;
for $i (0 .. 19) { $a[$i] = prob( $a[$i], prob($NULE[$i],1/20) ); };
push @inserts, [@a];
##
## state transitions
##
@a = readLine( *HMMER, $filename, 10 );
shift @a;
for $i (0 .. 8) { $a[$i] = prob( $a[$i], 1 ); };
($mm, $mi, $md, $im, $ii, $dm, $dd, $bm, $me) = @a;
if( $position == $M )
{
$dd=0; $md=0;
$dm=1; $mm=1; $im=1;
$mi=0; $ii=0;
}
elsif ( $position == 1 )
{
$total = $bm + $bd;
$bm /= $total;
$bd /= $total;
$states[1]->[4] = $bm;
$states[1]->[1] = $bd;
$states[1]->[5] = 1;
};
# renormalisation
$total = $mi + $md + $mm;
if( $total ) {
$mi /= $total;
$md /= $total;
$mm /= $total;
};
$total = $dd + $dm;
if( $total ) {
$dd /= $total;
$dm /= $total;
};
$total = $ii + $im;
if( $total ) {
$ii /= $total;
$im /= $total;
};
# save the probabilities
$states[$position+1]->[4] = $mm;
$states[$position]->[7] = $mi;
$states[$position+1]->[1] = $md;
$states[$position+1]->[5] = $im;
$states[$position]->[8] = $ii;
$states[$position+1]->[3] = $dm;
$states[$position+1]->[0] = $dd;
};
close HMMER;
##
## the "End" + last position values
##
push @matches, [ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0 ];
pop @inserts;
push @inserts, [ 0.077850, 0.016640, 0.057325, 0.065451, 0.039844,
0.062177, 0.024314, 0.060479, 0.064418, 0.082109,
0.024547, 0.047302, 0.040276, 0.041960, 0.048771,
0.067948, 0.059895, 0.072042, 0.011740, 0.034912 ];
push @inserts, [ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0 ];
};
##
## write a HMMER save file
##
my $header =
"ALPH Amino\n".
"XT -8455 -4 -1000 -1000 -8455 -4 -8455 -4 \n".
"NULT -4 -8455\n".
"NULE 595 -1558 85 338 -294 453 -1158 197 249 902 -1085 -142 -21 -313 45 531 201 384 -1998 -644 \n".
"HMM A C D E F G H I K L M N P Q R S T V W Y \n".
" m->m m->i m->d i->m i->i d->m d->d b->m m->e\n";
sub printLine
{
local *HANDLE = shift; # file handle to print to
print HANDLE pad(6, shift);
while( @_ ) { print HANDLE pad(7, shift); };
print HANDLE "\n";
};
sub writeHMMER
{
my $filename = shift; # file name to read from
# transition probabilities: $xy = P(X->Y)
my $dd, $dm, $md, $mm, $mi, $im, $ii, $bm, $me, $bd, $denom, $total;
my $i, $j, @a;
open( HMMER, ">$filename" ) or die "Cannot open $filename: $!\n...";
##
## header
##
print HMMER "HMMER2.0\n";
print HMMER "NAME $filename\n";
print HMMER "LENG $M\n";
print HMMER $header;
##
## B->D1 line
##
$bd = $states[1]->[1];
$denom = $bd + 2*$states[1]->[4];
printLine( *HMMER, "", score(1-$bd/$denom,1), "*", score($bd/$denom,1) );
for $i (1 ... $M)
{
##
## match emissions
##
@a=();
for $j (0 .. 19) {
push @a, score( $matches[$i]->[$j], prob($NULE[$j],1/20) );
};
printLine( *HMMER, $i, @a, $i );
##
## insert emissions
##
@a=();
for $j (0 .. 19) {
push @a, $i==$M ? "*" : score( $inserts[$i]->[$j], prob($NULE[$j],1/20) );
};
printLine( *HMMER, "-", @a );
##
## state transitions
##
if( $i == $M )
{
$mm=$mi=$md=$im=$ii=$dm=$dd=0;
$me=1;
}
else
{
$mm = $states[$i+1]->[4];
$mi = $states[$i]->[7];
$md = $states[$i+1]->[1];
$im = $states[$i+1]->[5];
$ii = $states[$i]->[8];
$dm = $states[$i+1]->[3];
$dd = $states[$i+1]->[0];
$me = 1/(2*($M-1)-$i);
# renormalization
$total = $dd + $dm;
$dd /= $total;
$dm /= $total;
$total = $ii + $im;
$id /= $total;
$im /= $total;
$total = $me + $mi + $md + $mm;
$me /= $total;
$mi /= $total;
$md /= $total;
$mm /= $total;
};
$bm = ($i==1) ? $states[1]->[4] : $states[1]->[4]/($M-1);
$bm = $bm / $denom;
printLine( *HMMER, "-",
score($mm, 1), score($mi, 1), score($md, 1),
score($im, 1), score($ii, 1), score($dm, 1),
score($dd, 1), score($bm, 1), score($me, 1) );
};
print HMMER "//\n";
close HMMER;
};
##
## main program
##
my $file = $ARGV[0];
die "$usage\n" unless @ARGV>=1;
if( $file =~ s/.mod$// )
{
# file name ends in .mod => assuming a SAM file
unless( $file =~ s/\.asc$// )
{
# does not end in .asc.mod => assuming a binary file
# => create the ascii .asc.mod file as well
`hmmconvert $file.asc -model_file $file.mod -binary_output 0`;
wait;
};
readSAM("$file.asc.mod");
writeHMMER("$file.con.hmm");
}
elsif( $file =~ s/\.hmm$// )
{
# file name ends in .hmm => assuming a HMMER file
readHMMER("$file.hmm");
writeSAM("$file.con.asc.mod");
# create the binary .mod file as well
`hmmconvert $file.con -model_file $file.con.asc.mod -binary_output 1`;
wait;
}
else
{
die "The input file must be either *.mod, *.asc.mod or *.hmm!\n$usage\n...";
};