#!/usr/bin/perl

# Copyright 2018 Dmitry Suplatov
#

use strict;
use warnings;
use File::Spec;
use File::Basename;
use List::Util qw (min max);
 
#Check if the script has been launched with exactly one command-line argument 
print "\n";
print "MUTIPROT v. 1.1 2019-05-08\n";
print "by Dmitry Suplatov\n";    
print "\n";

if (@ARGV<4) {    
    print "This script can merge sequence and structural data describing individual chains of multichain proteins ".
          "for further bioinformatic analysis of the complete protein\n";
    print "\n";
    print "This script can run in two modes: to process structural data (pdb) or sequence data (aln)\n";
    print "Usage: $0 pdb <input.pdb> <chainID> <chainID> <optional=chainID> <optional=chainID> ... <output.pdb> \n";
    print "     <input.pdb>  - Path to the input PDB structure of a multichain protein\n";
    print "     <chainID>    - Provide at least two IDs of protein chains to merge\n";
    print "     <output.pdb> - Path to the write the output PDB structure after processing\n";    
    print "Usage: $0 aln <input.fasta> <input.fasta> <optinal=input.fasta> <optinal=input.fasta> ... <output.fasta>\n";
    print "     <input.fasta>    - Provide at least two alignment corresponding to different protein chains to merge\n";
    print "     <output.fasta>   - Path to the write the output alignment after processing\n";    
    print "\n";
    
    if (@ARGV > 0) {
        print "Error: Insufficient input has been provided\n";
        print "\n";
    }
    exit 0;
}

my $MODE = $ARGV[0];
my $LOG = "multiprot_$MODE.log";

if (-f $LOG) {
    print "Error: The output log file $LOG already exists\n";
    print "\n";
    exit 1;
}

#------------------------------------------------------------------------------------------------------------------------

if ($MODE eq "pdb") {
    
    if (@ARGV < 5) {
        print "Error: Insufficient input has been provided\n";
        print "\n";
        exit 1;
    }
    
    my $PDB_IN = $ARGV[1];
    my @CHAINS=();
    my %CHAINS_HASH=();
    foreach my $i (2 .. ($#ARGV-1)) {
        push(@CHAINS, $ARGV[$i]);
        
        if ($CHAINS_HASH{$ARGV[$i]}) {
            print "Error: ChainID $ARGV[$i] has been provided more than once in the input command\n";
            print "\n";
            exit 1;
        }
        
        $CHAINS_HASH{$ARGV[$i]}=1;
    }
    my $PDB_OUT = $ARGV[$#ARGV];
    print "Input: INPUT PDB       $PDB_IN\n";
    print "Input: INPUT CHAINS    [@CHAINS]\n";
    print "Input: OUTPUT PDB      $PDB_OUT\n";
    print "Input: OUTPUT CHAIN_ID $CHAINS[0]\n";
    print "\n";
    
    if (-f $PDB_OUT) {
        print "Error: The output file $PDB_OUT already exists\n";
        print "\n";
        exit 1;
    }
        
    print "Info: Reading the input PDB file $PDB_IN\n";
    my $pdb = readpdb($PDB_IN);
    print "Info: Renaming chains [@CHAINS] to chain $CHAINS[0] and updating the numbering\n";
    
    open(LOG, ">$LOG") or die "Error: Failed to open log file $LOG for writing\n\n";
    print LOG "#Input PDB:  $PDB_IN\n";
    print LOG "#Chains to merge: [@CHAINS]\n";
    print LOG "#The ID of the merged chain: $CHAINS[0]\n";
    print LOG "#Output PDB: $PDB_OUT\n";
    print LOG "#The correspondence table between the input and output residue numbering is provided below\n";
    print LOG "#\n";
    print LOG "#Input ResName ChainID resID -> Output ResName ChainID resID\n";
    my $lastnum=-999999;
    my $ITERATOR=0;
    my %chain_info = ();
    my @chains_all_inputorder = ();
    
    foreach my $id (sort {$a <=> $b} keys %{$pdb}) {
        my $chain = $pdb->{$id}->{"chainid"};
        my $resid = $pdb->{$id}->{"resn"};
        my $resname = $pdb->{$id}->{"res"};
        
        if (!$chain_info{$chain}) {push(@chains_all_inputorder, $chain);}
        
        $chain_info{$chain}++;
        
        if ($CHAINS_HASH{$chain}) {
            
            my $log=0;
            if ($lastnum == -999999 || $lastnum != $resid) {
                $ITERATOR++;
                $log=1;
            }
            
            $lastnum=$resid;
            $pdb->{$id}->{"resn"}=$ITERATOR;
            
            if ($chain ne $CHAINS[0]) {
                $pdb->{$id}->{"chainid"}=$CHAINS[0];
            }
            
            if ($log) {print LOG sprintf("%3s %1s %4s -> %3s %1s %4s\n",$resname, $chain, $resid, $resname, $pdb->{$id}->{"chainid"}, $pdb->{$id}->{"resn"});}        
        }
    }
    print LOG "#EOF\n";
    close(LOG);
    
    foreach my $chain (@CHAINS) {
        if (!$chain_info{$chain}) {
            print "Error: There is no chain $chain in the input PDB file\n";
            print "\n";
            exit 1;
        }
    }
    
    foreach my $chain (@chains_all_inputorder) {
        print "Info: Found ".($chain_info{$chain}-1)." atoms in chain $chain";
        if ($CHAINS_HASH{$chain}) {
            print " -> converted to chain $CHAINS[0]";
        }
        print "\n";
    }
    
    print "Info: Writing the processed PDB to $PDB_OUT\n";
    printpdb($pdb, $PDB_OUT);
    if (! -f $PDB_OUT || ! -s $PDB_OUT) {
        print "Error: Failed to write the output file\n";
        print "\n";
        exit 1;
    }
    
    print "Info: The correspondence table between the input and output residue numbering has been printed to $LOG\n";    
}

#------------------------------------------------------------------------------------------------------------------------

elsif ($MODE eq "aln") {

    if (@ARGV < 4) {
        print "Error: Insufficient input has been provided\n";
        print "\n";
        exit 1;
    }
    
    my @ALNS=();
    my %ALNS_HASH=();
    foreach my $i (1 .. ($#ARGV-1)) {
        push(@ALNS, $ARGV[$i]);
        $ALNS_HASH{$ARGV[$i]}++;
    }
    
    my $ALN_OUT = $ARGV[$#ARGV];
    print "Input: INPUT FILE SEQUENCE CONTAINS ".($#ALNS+1)." ENTRIES\n";
    foreach my $i (0..$#ALNS) {
        print "Input: INPUT ALIGNMENT #".($i+1)." $ALNS[$i] ";
        if ($ALNS_HASH{$ALNS[$i]} > 1) {
            print " ! (used $ALNS_HASH{$ALNS[$i]} times)";
        }
        print "\n";
    }
    print "Input: OUTPUT ALIGNMENT $ALN_OUT\n";
    print "\n";
    
    if (-f $ALN_OUT) {
        print "Error: The output file $ALN_OUT already exists\n";
        print "\n";
        exit 1;
    }

    print "Info: Reading the input multiple alignment files\n";
    my %data_hash = ();
    my %common_seq = ();
    my $file_count = 0;
    foreach my $aln_file (@ALNS) {
        if ($data_hash{$aln_file}) {next;}        
        $file_count++;        
        my $aln = readaln($aln_file);
        $data_hash{$aln_file} = $aln;
        my @sequences = keys %{$aln};
        print "Info: Alignment file $aln_file contains ".($#sequences+1)." sequences and ".($#{$aln->{$sequences[0]}}+1)." columns\n";
        foreach my $seq (@sequences) {
            $common_seq{$seq}++;   
        }
    }
    
    print "Info: Looking for common sequences which are present in all input alignment files\n";
    my $seqlistidenticalinallfiles=1;
    foreach my $seq (keys %common_seq) {
        if ($common_seq{$seq} < $file_count) {
            delete($common_seq{$seq});
            $seqlistidenticalinallfiles=0;
        } elsif ($common_seq{$seq} > $file_count) {
            print "Error: Sequence $seq has been observed $common_seq{$seq} times in $file_count files\n";
            print "\n";
            exit 1;
        }
    }
    my $total_seq_expected = (scalar keys %common_seq);
    my $total_columns_expect = 0;
    
    if ($total_seq_expected == 0) {
        print "Error: There are no proteins which are present in all input alignment files\n";
        print "Error: Merge of alignment files is not possible, i.e., there is nothing to merge\n";
        print "Error: The input files must contain alignments of parts of the same proteins, i.e., chains A and B\n";
        print "\n";
        exit 1;
    }
    
    if ($seqlistidenticalinallfiles) {
        print "Info: All input alignment files contain a common list of $total_seq_expected sequences\n";
        #Calculate the expected number of columns of the merged alignment based on the current length of input alignments
        foreach my $aln_file (sort keys %ALNS_HASH) {
            my $aln = $data_hash{$aln_file};
            my @sequences = keys %{$aln};
            $total_columns_expect += (($#{$aln->{$sequences[0]}}+1) * $ALNS_HASH{$aln_file});
        }
    } else {
        print "Info: There are $total_seq_expected sequences which are present in all input alignment files\n";                
        print "Info: Cleaning alignment data from sequences which are not present in all input alignment files\n";
        foreach my $aln_file (sort keys %ALNS_HASH) {
            
            print "Info: Cleaning the alignment $aln_file\n";
            my $aln = $data_hash{$aln_file};
            
            #First delete all rows of an alignment corresponding to non-common sequences
            foreach my $seqname (keys %{$aln}) {
                if (!$common_seq{$seqname}) {
                    delete($aln->{$seqname});
                }
            }
            
            #Calculate the gap occurence in every remaining column
            my $ncolumns = -1;
            my %column_gap_counter=();
            foreach my $seqname (keys %{$aln}) {
                if ($ncolumns == -1) {
                    $ncolumns = $#{$aln->{$seqname}} + 1;
                    foreach my $i (0..($ncolumns-1)) {
                        $column_gap_counter{$i} = 0;
                    }
                } elsif ($ncolumns != $#{$aln->{$seqname}} + 1) {
                    print "Error: The number of columns for sequence $seqname is inconsistent with the previous sequence\n";
                    print "\n";
                    exit 1;
                }
                
                foreach my $i (0..$#{$aln->{$seqname}}) {
                    my $aa = $aln->{$seqname}->[$i];
                    if ($aa eq '-') {
                        $column_gap_counter{$i}++;
                    }                
                }
            }
            
            #Identify the gaps-only columns
            my %columns_gapped=();
            my @sequences = keys %{$aln};        
            my $nseq = scalar @sequences;
            foreach my $i (0..$ncolumns-1) {
                if ($column_gap_counter{$i} == $nseq) {
                    $columns_gapped{$i}=1;
                }
            }
            
            #Remove the gaps-only columns
            foreach my $seqname (keys %{$aln}) {
                my @arr = ();
                foreach my $i (0..$#{$aln->{$seqname}}) {
                    push (@arr, $aln->{$seqname}->[$i]) unless $columns_gapped{$i};
                }
                $aln->{$seqname} = \@arr;
            }
            print "Info: Removed ".(scalar keys %columns_gapped)." gaps-only columns in $aln_file after clean-up\n";
            
            #Finally, increment the total_columns_expect counter by the length of this alignment multiplied by the number of times it will be merged                
            $total_columns_expect += (($#{$aln->{$sequences[0]}}+1) * $ALNS_HASH{$aln_file});
        }
    }

    print "Info: Merging alignments of common sequences in the requested order and printing the final alignment to $ALN_OUT\n";
    open(FO, ">$ALN_OUT") or die "Error: Failed to open file $ALN_OUT for writing\n";
    my $total_seq_fact = 0;
    my $total_columns_fact=0;
    foreach my $seqname (keys %{$data_hash{$ALNS[0]}}) {
        print FO ">$seqname\n";        
        my $c = 0;
        $total_columns_fact=0;
        foreach my $aln_file_id (0..$#ALNS) {
            my $aln_file = $ALNS[$aln_file_id];
            my $aln = $data_hash{$aln_file};
            foreach my $i (0..$#{$aln->{$seqname}}) {
                print FO $aln->{$seqname}->[$i];
                $c++;
                $total_columns_fact++;
                if ($c % 60 == 0) {
                    print FO "\n";
                    $c=0;
                }
            }
        }
        if ($c != 0) {
            print FO "\n";
        }
        print FO "\n";
        
        $total_seq_fact++;
        if ($total_columns_fact != $total_columns_expect) {
            print "Error: The number of columns for protein $seqname ($total_columns_fact) is different from what has been expected ($total_columns_expect)\n";
            print "\n";
            exit 1;
        }
    }
    
    print "Info: The merged alignment contains ".$total_seq_fact." sequences and ".$total_columns_fact." columns\n";
    
    if ( $total_seq_fact != $total_seq_expected ) {
        print "Error: The output does not match expectations (i.e, $total_seq_expected sequences were expected)\n";
        print "\n";
        exit 1
    }
}

#------------------------------------------------------------------------------------------------------------------------

else {
    print "Error: Unknown mode $MODE\n";
    exit 0;
}

print "\n";
print "Done!\n";

#===================================================================
# SUBROUTINES
#===================================================================

#Read in a pdb file
sub readpdb {
    my $input = shift;
    
    my %pdb = ();
    open(FI, $input) or die "Error: Can not open file $input for reading\n\n";
    
    #Read in the PDB file
    
    while(<FI>)
    {
        chomp;                
        
        #Remove Windows ^M sign
        s/\r//g;
        
        my $keyword = substr($_,0,6);    # look for ATOM record
        if ($keyword eq "ATOM  " || $keyword eq "HETATM") {
        
            my $length = length $_;
            if ($length < 54) {                
                print "\n$_\n";
                print "Error: ATOM and HETATM fields must be at least 54 characters in length (found $length))\n";
                pdbFormatError();
            }
            
            my $num = substr($_,6,5);
            $num =~ s/\s+//g;
            while ($pdb{$num}) {
                $num++;
            }

            $pdb{$num}{"head"} = substr($_,0,6);
            $pdb{$num}{"atom"} = substr($_,12,4);
            my $type = $pdb{$num}{"atom"};
            $type=substr($type,0,1);
            $pdb{$num}{"type"} = $type;            
            $pdb{$num}{"res"} = substr($_,16,4);            
            $pdb{$num}{"chainid"} = substr($_,21,1);
            $pdb{$num}{"resn"} = substr($_,22,4);            
            $pdb{$num}{"xcor"} = substr($_,30,8);
            $pdb{$num}{"ycor"} = substr($_,38,8);
            $pdb{$num}{"zcor"} = substr($_,46,8);
            #$pdb{$num}{"occ"} = substr($_,54,6);
            #$pdb{$num}{"bfac"} = substr($_,60,6);
            #$pdb{$num}{"segid"} = substr($_,72,4);
            #$pdb{$num}{"elm"} = substr($_,76,2);
            #$pdb{$num}{"charge"} = substr($_,78,2);
            
            #Remove spaces
            foreach my $key (keys %{$pdb{$num}}) {
                $pdb{$num}{$key} =~ s/\s+//g;
            }
        }
    }
    close(FI);
    return \%pdb;
}

sub pdbFormatError{    
    print "Error: The file submitted as PDB is not valid\n\n";    
    exit 1;
}

sub readaln
{
    my $filename = shift;
    open(FI, $filename) or die "Error: Can not open file $filename for reading\n\n";;
    my %hash = ();
    my %hash_check=();
    my $name = "";
    my $seq = "";

    while(<FI>)
    {
        chomp;

        #Remove Windows ^M sign
        s/\r//g;

        if (/^>(.+)/) 
        {
            if (!$name) {$name = $1; next;}
            my $tmp = $1;            
                        
            my @arr=split(//,$seq);
            #if ($hash{$name}) {push (@{$hash{$name}}, $chain);} # Insert chain delemeter after the first chain
            $hash{$name} = \@arr; # _Append_ new lines to the hash
            
            if ($hash_check{$name}) {
                print "Error: Sequence name \"$name\" used twice in one alignment\n";
                fastaFormatError();            
            }
            
            $hash_check{$name}=1;            
            $name = $tmp;
            $seq = "";
            next;
        }    
        #if (!$name) {die "Hard!!! Sequence without a name:\n$_\n";}
                
        #Convert to upper case
        $seq.= uc $_;    
    }
    if ($name) {
        my @arr=split(//,$seq);
        $hash{$name} = \@arr; # _Append_ new lines to the hash
        if ($hash_check{$name}) {
            print "Error: Sequence name \"$name\" used twice in one alignment\n";
            fastaFormatError();            
        }
    }
    else {
        print "Error: Unexpected end-of-file\n";
        fastaFormatError();            
    }
    
    return \%hash;
}

sub fastaFormatError{

    print "Error: The file submitted is not a valid FASTA Alignment \n";
    print "Error: You can read more about the <a href=\"http://en.wikipedia.org/wiki/FASTA_format\">FASTA format</a> and <a href=\"http://en.wikipedia.org/wiki/Multiple_sequence_alignment\">Multiple sequence alignments</a>\n";
    print "Error: Please revise your data and resubmit in a new task\n\n";
    exit 1;
}

#Print a pdb file
sub printpdb {    
    my $pdb = shift;
    my $out = shift;
    
    #Find the last id-index
    my @old_ids = keys %{$pdb};
    my $min = min @old_ids;
    my $max = max @old_ids;
    
    #Now print out the results in the topology atom order
    open(FO, ">$out") or die "ERROR: Can not open file $out for writing\n\n";
    my $new_id = 0;
    foreach my $old_id ($min..$max)
    {
        #We have deleted some entries or they could have been missing in the original file
        #So simply skip such cases
        if (!$pdb->{$old_id}) {
            next;
        }        
        
        $new_id++;
        
        my $ATOM_NAME = sprintf("%-3s", $pdb->{$old_id}{"atom"});
        $ATOM_NAME = sprintf("%5s", $ATOM_NAME);
                
        printf FO "%-6s%5s$ATOM_NAME%4s %1s%4d    %8.3f%8.3f%8.3f\n",#%6.2f%6.2f      %4s%2s%2s\n",
            $pdb->{$old_id}{"head"},
            $new_id,
            #$pdb->{$old_id}{"atom"},
            $pdb->{$old_id}{"res"},
            $pdb->{$old_id}{"chainid"},
            $pdb->{$old_id}{"resn"},
            $pdb->{$old_id}{"xcor"},
            $pdb->{$old_id}{"ycor"},
            $pdb->{$old_id}{"zcor"}; #,
            #$pdb->{$old_id}{"occ"},
            #$pdb->{$old_id}{"bfac"},
            #$pdb->{$old_id}{"segid"},
            #$pdb->{$old_id}{"elm"},
            #$pdb->{$old_id}{"charge"};
    }
    close(FO);
}
