#!/usr/bin/perl # Copyright 2018 Dmitry Suplatov # use strict; use warnings; use File::Spec; use File::Basename; use List::Util qw (min max); #Check if the script has been launched with exactly one command-line argument print "\n"; print "MUTIPROT v. 1.1 2019-05-08\n"; print "by Dmitry Suplatov\n"; print "\n"; if (@ARGV<4) { print "This script can merge sequence and structural data describing individual chains of multichain proteins ". "for further bioinformatic analysis of the complete protein\n"; print "\n"; print "This script can run in two modes: to process structural data (pdb) or sequence data (aln)\n"; print "Usage: $0 pdb ... \n"; print " - Path to the input PDB structure of a multichain protein\n"; print " - Provide at least two IDs of protein chains to merge\n"; print " - Path to the write the output PDB structure after processing\n"; print "Usage: $0 aln ... \n"; print " - Provide at least two alignment corresponding to different protein chains to merge\n"; print " - Path to the write the output alignment after processing\n"; print "\n"; if (@ARGV > 0) { print "Error: Insufficient input has been provided\n"; print "\n"; } exit 0; } my $MODE = $ARGV[0]; my $LOG = "multiprot_$MODE.log"; if (-f $LOG) { print "Error: The output log file $LOG already exists\n"; print "\n"; exit 1; } #------------------------------------------------------------------------------------------------------------------------ if ($MODE eq "pdb") { if (@ARGV < 5) { print "Error: Insufficient input has been provided\n"; print "\n"; exit 1; } my $PDB_IN = $ARGV[1]; my @CHAINS=(); my %CHAINS_HASH=(); foreach my $i (2 .. ($#ARGV-1)) { push(@CHAINS, $ARGV[$i]); if ($CHAINS_HASH{$ARGV[$i]}) { print "Error: ChainID $ARGV[$i] has been provided more than once in the input command\n"; print "\n"; exit 1; } $CHAINS_HASH{$ARGV[$i]}=1; } my $PDB_OUT = $ARGV[$#ARGV]; print "Input: INPUT PDB $PDB_IN\n"; print "Input: INPUT CHAINS [@CHAINS]\n"; print "Input: OUTPUT PDB $PDB_OUT\n"; print "Input: OUTPUT CHAIN_ID $CHAINS[0]\n"; print "\n"; if (-f $PDB_OUT) { print "Error: The output file $PDB_OUT already exists\n"; print "\n"; exit 1; } print "Info: Reading the input PDB file $PDB_IN\n"; my $pdb = readpdb($PDB_IN); print "Info: Renaming chains [@CHAINS] to chain $CHAINS[0] and updating the numbering\n"; open(LOG, ">$LOG") or die "Error: Failed to open log file $LOG for writing\n\n"; print LOG "#Input PDB: $PDB_IN\n"; print LOG "#Chains to merge: [@CHAINS]\n"; print LOG "#The ID of the merged chain: $CHAINS[0]\n"; print LOG "#Output PDB: $PDB_OUT\n"; print LOG "#The correspondence table between the input and output residue numbering is provided below\n"; print LOG "#\n"; print LOG "#Input ResName ChainID resID -> Output ResName ChainID resID\n"; my $lastnum=-999999; my $ITERATOR=0; my %chain_info = (); my @chains_all_inputorder = (); foreach my $id (sort {$a <=> $b} keys %{$pdb}) { my $chain = $pdb->{$id}->{"chainid"}; my $resid = $pdb->{$id}->{"resn"}; my $resname = $pdb->{$id}->{"res"}; if (!$chain_info{$chain}) {push(@chains_all_inputorder, $chain);} $chain_info{$chain}++; if ($CHAINS_HASH{$chain}) { my $log=0; if ($lastnum == -999999 || $lastnum != $resid) { $ITERATOR++; $log=1; } $lastnum=$resid; $pdb->{$id}->{"resn"}=$ITERATOR; if ($chain ne $CHAINS[0]) { $pdb->{$id}->{"chainid"}=$CHAINS[0]; } if ($log) {print LOG sprintf("%3s %1s %4s -> %3s %1s %4s\n",$resname, $chain, $resid, $resname, $pdb->{$id}->{"chainid"}, $pdb->{$id}->{"resn"});} } } print LOG "#EOF\n"; close(LOG); foreach my $chain (@CHAINS) { if (!$chain_info{$chain}) { print "Error: There is no chain $chain in the input PDB file\n"; print "\n"; exit 1; } } foreach my $chain (@chains_all_inputorder) { print "Info: Found ".($chain_info{$chain}-1)." atoms in chain $chain"; if ($CHAINS_HASH{$chain}) { print " -> converted to chain $CHAINS[0]"; } print "\n"; } print "Info: Writing the processed PDB to $PDB_OUT\n"; printpdb($pdb, $PDB_OUT); if (! -f $PDB_OUT || ! -s $PDB_OUT) { print "Error: Failed to write the output file\n"; print "\n"; exit 1; } print "Info: The correspondence table between the input and output residue numbering has been printed to $LOG\n"; } #------------------------------------------------------------------------------------------------------------------------ elsif ($MODE eq "aln") { if (@ARGV < 4) { print "Error: Insufficient input has been provided\n"; print "\n"; exit 1; } my @ALNS=(); my %ALNS_HASH=(); foreach my $i (1 .. ($#ARGV-1)) { push(@ALNS, $ARGV[$i]); $ALNS_HASH{$ARGV[$i]}++; } my $ALN_OUT = $ARGV[$#ARGV]; print "Input: INPUT FILE SEQUENCE CONTAINS ".($#ALNS+1)." ENTRIES\n"; foreach my $i (0..$#ALNS) { print "Input: INPUT ALIGNMENT #".($i+1)." $ALNS[$i] "; if ($ALNS_HASH{$ALNS[$i]} > 1) { print " ! (used $ALNS_HASH{$ALNS[$i]} times)"; } print "\n"; } print "Input: OUTPUT ALIGNMENT $ALN_OUT\n"; print "\n"; if (-f $ALN_OUT) { print "Error: The output file $ALN_OUT already exists\n"; print "\n"; exit 1; } print "Info: Reading the input multiple alignment files\n"; my %data_hash = (); my %common_seq = (); my $file_count = 0; foreach my $aln_file (@ALNS) { if ($data_hash{$aln_file}) {next;} $file_count++; my $aln = readaln($aln_file); $data_hash{$aln_file} = $aln; my @sequences = keys %{$aln}; print "Info: Alignment file $aln_file contains ".($#sequences+1)." sequences and ".($#{$aln->{$sequences[0]}}+1)." columns\n"; foreach my $seq (@sequences) { $common_seq{$seq}++; } } print "Info: Looking for common sequences which are present in all input alignment files\n"; my $seqlistidenticalinallfiles=1; foreach my $seq (keys %common_seq) { if ($common_seq{$seq} < $file_count) { delete($common_seq{$seq}); $seqlistidenticalinallfiles=0; } elsif ($common_seq{$seq} > $file_count) { print "Error: Sequence $seq has been observed $common_seq{$seq} times in $file_count files\n"; print "\n"; exit 1; } } my $total_seq_expected = (scalar keys %common_seq); my $total_columns_expect = 0; if ($total_seq_expected == 0) { print "Error: There are no proteins which are present in all input alignment files\n"; print "Error: Merge of alignment files is not possible, i.e., there is nothing to merge\n"; print "Error: The input files must contain alignments of parts of the same proteins, i.e., chains A and B\n"; print "\n"; exit 1; } if ($seqlistidenticalinallfiles) { print "Info: All input alignment files contain a common list of $total_seq_expected sequences\n"; #Calculate the expected number of columns of the merged alignment based on the current length of input alignments foreach my $aln_file (sort keys %ALNS_HASH) { my $aln = $data_hash{$aln_file}; my @sequences = keys %{$aln}; $total_columns_expect += (($#{$aln->{$sequences[0]}}+1) * $ALNS_HASH{$aln_file}); } } else { print "Info: There are $total_seq_expected sequences which are present in all input alignment files\n"; print "Info: Cleaning alignment data from sequences which are not present in all input alignment files\n"; foreach my $aln_file (sort keys %ALNS_HASH) { print "Info: Cleaning the alignment $aln_file\n"; my $aln = $data_hash{$aln_file}; #First delete all rows of an alignment corresponding to non-common sequences foreach my $seqname (keys %{$aln}) { if (!$common_seq{$seqname}) { delete($aln->{$seqname}); } } #Calculate the gap occurence in every remaining column my $ncolumns = -1; my %column_gap_counter=(); foreach my $seqname (keys %{$aln}) { if ($ncolumns == -1) { $ncolumns = $#{$aln->{$seqname}} + 1; foreach my $i (0..($ncolumns-1)) { $column_gap_counter{$i} = 0; } } elsif ($ncolumns != $#{$aln->{$seqname}} + 1) { print "Error: The number of columns for sequence $seqname is inconsistent with the previous sequence\n"; print "\n"; exit 1; } foreach my $i (0..$#{$aln->{$seqname}}) { my $aa = $aln->{$seqname}->[$i]; if ($aa eq '-') { $column_gap_counter{$i}++; } } } #Identify the gaps-only columns my %columns_gapped=(); my @sequences = keys %{$aln}; my $nseq = scalar @sequences; foreach my $i (0..$ncolumns-1) { if ($column_gap_counter{$i} == $nseq) { $columns_gapped{$i}=1; } } #Remove the gaps-only columns foreach my $seqname (keys %{$aln}) { my @arr = (); foreach my $i (0..$#{$aln->{$seqname}}) { push (@arr, $aln->{$seqname}->[$i]) unless $columns_gapped{$i}; } $aln->{$seqname} = \@arr; } print "Info: Removed ".(scalar keys %columns_gapped)." gaps-only columns in $aln_file after clean-up\n"; #Finally, increment the total_columns_expect counter by the length of this alignment multiplied by the number of times it will be merged $total_columns_expect += (($#{$aln->{$sequences[0]}}+1) * $ALNS_HASH{$aln_file}); } } print "Info: Merging alignments of common sequences in the requested order and printing the final alignment to $ALN_OUT\n"; open(FO, ">$ALN_OUT") or die "Error: Failed to open file $ALN_OUT for writing\n"; my $total_seq_fact = 0; my $total_columns_fact=0; foreach my $seqname (keys %{$data_hash{$ALNS[0]}}) { print FO ">$seqname\n"; my $c = 0; $total_columns_fact=0; foreach my $aln_file_id (0..$#ALNS) { my $aln_file = $ALNS[$aln_file_id]; my $aln = $data_hash{$aln_file}; foreach my $i (0..$#{$aln->{$seqname}}) { print FO $aln->{$seqname}->[$i]; $c++; $total_columns_fact++; if ($c % 60 == 0) { print FO "\n"; $c=0; } } } if ($c != 0) { print FO "\n"; } print FO "\n"; $total_seq_fact++; if ($total_columns_fact != $total_columns_expect) { print "Error: The number of columns for protein $seqname ($total_columns_fact) is different from what has been expected ($total_columns_expect)\n"; print "\n"; exit 1; } } print "Info: The merged alignment contains ".$total_seq_fact." sequences and ".$total_columns_fact." columns\n"; if ( $total_seq_fact != $total_seq_expected ) { print "Error: The output does not match expectations (i.e, $total_seq_expected sequences were expected)\n"; print "\n"; exit 1 } } #------------------------------------------------------------------------------------------------------------------------ else { print "Error: Unknown mode $MODE\n"; exit 0; } print "\n"; print "Done!\n"; #=================================================================== # SUBROUTINES #=================================================================== #Read in a pdb file sub readpdb { my $input = shift; my %pdb = (); open(FI, $input) or die "Error: Can not open file $input for reading\n\n"; #Read in the PDB file while() { chomp; #Remove Windows ^M sign s/\r//g; my $keyword = substr($_,0,6); # look for ATOM record if ($keyword eq "ATOM " || $keyword eq "HETATM") { my $length = length $_; if ($length < 54) { print "\n$_\n"; print "Error: ATOM and HETATM fields must be at least 54 characters in length (found $length))\n"; pdbFormatError(); } my $num = substr($_,6,5); $num =~ s/\s+//g; while ($pdb{$num}) { $num++; } $pdb{$num}{"head"} = substr($_,0,6); $pdb{$num}{"atom"} = substr($_,12,4); my $type = $pdb{$num}{"atom"}; $type=substr($type,0,1); $pdb{$num}{"type"} = $type; $pdb{$num}{"res"} = substr($_,16,4); $pdb{$num}{"chainid"} = substr($_,21,1); $pdb{$num}{"resn"} = substr($_,22,4); $pdb{$num}{"xcor"} = substr($_,30,8); $pdb{$num}{"ycor"} = substr($_,38,8); $pdb{$num}{"zcor"} = substr($_,46,8); #$pdb{$num}{"occ"} = substr($_,54,6); #$pdb{$num}{"bfac"} = substr($_,60,6); #$pdb{$num}{"segid"} = substr($_,72,4); #$pdb{$num}{"elm"} = substr($_,76,2); #$pdb{$num}{"charge"} = substr($_,78,2); #Remove spaces foreach my $key (keys %{$pdb{$num}}) { $pdb{$num}{$key} =~ s/\s+//g; } } } close(FI); return \%pdb; } sub pdbFormatError{ print "Error: The file submitted as PDB is not valid\n\n"; exit 1; } sub readaln { my $filename = shift; open(FI, $filename) or die "Error: Can not open file $filename for reading\n\n";; my %hash = (); my %hash_check=(); my $name = ""; my $seq = ""; while() { chomp; #Remove Windows ^M sign s/\r//g; if (/^>(.+)/) { if (!$name) {$name = $1; next;} my $tmp = $1; my @arr=split(//,$seq); #if ($hash{$name}) {push (@{$hash{$name}}, $chain);} # Insert chain delemeter after the first chain $hash{$name} = \@arr; # _Append_ new lines to the hash if ($hash_check{$name}) { print "Error: Sequence name \"$name\" used twice in one alignment\n"; fastaFormatError(); } $hash_check{$name}=1; $name = $tmp; $seq = ""; next; } #if (!$name) {die "Hard!!! Sequence without a name:\n$_\n";} #Convert to upper case $seq.= uc $_; } if ($name) { my @arr=split(//,$seq); $hash{$name} = \@arr; # _Append_ new lines to the hash if ($hash_check{$name}) { print "Error: Sequence name \"$name\" used twice in one alignment\n"; fastaFormatError(); } } else { print "Error: Unexpected end-of-file\n"; fastaFormatError(); } return \%hash; } sub fastaFormatError{ print "Error: The file submitted is not a valid FASTA Alignment \n"; print "Error: You can read more about the FASTA format and Multiple sequence alignments\n"; print "Error: Please revise your data and resubmit in a new task\n\n"; exit 1; } #Print a pdb file sub printpdb { my $pdb = shift; my $out = shift; #Find the last id-index my @old_ids = keys %{$pdb}; my $min = min @old_ids; my $max = max @old_ids; #Now print out the results in the topology atom order open(FO, ">$out") or die "ERROR: Can not open file $out for writing\n\n"; my $new_id = 0; foreach my $old_id ($min..$max) { #We have deleted some entries or they could have been missing in the original file #So simply skip such cases if (!$pdb->{$old_id}) { next; } $new_id++; my $ATOM_NAME = sprintf("%-3s", $pdb->{$old_id}{"atom"}); $ATOM_NAME = sprintf("%5s", $ATOM_NAME); printf FO "%-6s%5s$ATOM_NAME%4s %1s%4d %8.3f%8.3f%8.3f\n",#%6.2f%6.2f %4s%2s%2s\n", $pdb->{$old_id}{"head"}, $new_id, #$pdb->{$old_id}{"atom"}, $pdb->{$old_id}{"res"}, $pdb->{$old_id}{"chainid"}, $pdb->{$old_id}{"resn"}, $pdb->{$old_id}{"xcor"}, $pdb->{$old_id}{"ycor"}, $pdb->{$old_id}{"zcor"}; #, #$pdb->{$old_id}{"occ"}, #$pdb->{$old_id}{"bfac"}, #$pdb->{$old_id}{"segid"}, #$pdb->{$old_id}{"elm"}, #$pdb->{$old_id}{"charge"}; } close(FO); }