#!/usr/bin/perl -w

use FileHandle;
use Carp;
use Bio::Seq;
use Bio::SeqIO;
use Bio::Tools::SeqWords;
use strict;

# Command-line entry point.
#
# Usage: perl calc.nmer.prob.pl fasta_file nmer_size
#
# Prints one tab-separated line per word, sorted by word:
#   word <TAB> count <TAB> count / total count
# Diagnostics go to STDERR so they never end up inside the table when
# STDOUT is redirected to a file.
if (@ARGV < 2) {
    print STDERR "Usage: perl calc.nmer.prob.pl fasta_file nmer_size\n";
    exit(1);
}

my $fastaFile = $ARGV[0];
my $nmerSize  = $ARGV[1];

# Check the shape of the argument before comparing it numerically: a
# non-numeric string would otherwise numify to 0 and slip past the
# upper-bound test below.
if ($nmerSize !~ /\A[0-9]+\z/ || $nmerSize < 1) {
    print STDERR "Nmer size must be a positive integer\n";
    exit(1);
}

if ($nmerSize > 10) {
    print STDERR "Nmer size must be <= 10\n";
    exit(1);
}

my $nmers = CountFileWords( $fastaFile, $nmerSize );
my $prob  = CalcNmerProbs( $nmers );

foreach my $w (sort keys %{$nmers}) {
    print "$w\t$nmers->{$w}\t$prob->{$w}\n";
}


# Convert raw word counts into relative frequencies.
#
# Arguments:
#   $nmers - hash reference mapping each word to its count.
# Returns:
#   A new hash reference with the same keys, each mapped to
#   count / (sum of all counts). The input hash is not modified.
#   When the counts sum to zero every key maps to 0, rather than the
#   sub dying with "Illegal division by zero".
sub CalcNmerProbs {
    my ($nmers) = @_;

    my $sum = 0;
    $sum += $_ for values %{$nmers};

    my %prob;
    foreach my $w (keys %{$nmers}) {
        # Guard the division: a zero total has no meaningful ratio.
        $prob{$w} = $sum ? $nmers->{$w} / $sum : 0;
    }

    return \%prob;
}


# Tally the words of a given length found in every record of a FASTA file.
#
# Arguments:
#   $file - path handed to Bio::SeqIO as -file (read with -format 'Fasta').
#   $size - word length forwarded to CountWords.
# Returns:
#   A hash reference of word => count. Each record contributes twice:
#   once as read and once as its reverse complement.
sub CountFileWords {
    my ($file, $size) = @_;

    my $tally  = {};
    my $reader = Bio::SeqIO->new( -file => $file, -format => 'Fasta' );

    while ( my $record = $reader->next_seq() ) {
        # Forward strand first, then the reverse complement.
        CountWords( $record, $tally, $size );
        CountWords( $record->revcom(), $tally, $size );
    }
    $reader->close();

    return $tally;
}


# Add the overlapping word counts of one sequence to a running tally.
#
# Arguments:
#   $seq   - sequence object passed to Bio::Tools::SeqWords->new as -seq.
#   $nmers - hash reference of word => count, updated in place.
#   $size  - word length passed to count_overlap_words.
# Words matching /N/ are left out of the tally.
sub CountWords {
    my ($seq, $nmers, $size) = @_;

    my $counter = Bio::Tools::SeqWords->new( -seq => $seq );
    my $tally   = $counter->count_overlap_words( $size );

    # Only words without an N are accumulated.
    for my $word ( grep { $_ !~ /N/ } keys %{$tally} ) {
        $nmers->{$word} += $tally->{$word};
    }
}


# Strip line endings and surrounding whitespace from copies of the arguments.
#
# Arguments:
#   a list of strings; the caller's values are not modified.
# Returns:
#   In list context, the cleaned copies; otherwise only the first one.
#   False values (undef, '', '0') are passed through untouched.
sub trim {
    my @cleaned = @_;

    for my $item (@cleaned) {
        next unless $item;

        chomp $item;
        $item =~ s/\r+$//;
        $item =~ s/^\s+//;
        $item =~ s/\s+$//;
    }

    return @cleaned if wantarray;
    return $cleaned[0];
}


# Open a file for reading and return its handle.
#
# Arguments:
#   $file - path of the file to read.
# Returns:
#   A FileHandle object opened on $file.
# Croaks with the system error text when the file cannot be opened.
sub OpenFile {
    my ($file) = @_;

    my $fh = FileHandle->new;

    # Pass an explicit read mode so the name is taken literally; with the
    # filename alone, a leading '>' or a trailing '|' in $file would be
    # interpreted as an open mode or a pipe.
    $fh->open( $file, 'r' ) or croak "Can't open $file: $!";
    return $fh;
}
