#!/usr/bin/perl ########################### # ASAP # Automated Simultaneous Analysis Phylogeny # # syntax: # ASAP [input dir] [force|justMatrix] # # August 2007 # # Indra Neil Sarkar, PhD # MBLWHOI Library # Marine Biological Laboratory # 7 MBL Street # Woods Hole, MA 02543 USA # # sarkar@mbl.edu # ########################### #### #### DEFINE SETTINGS #### # DESIGNATED OUTGROUP $defineOutgroup = ""; # LOCATION OF PAUP* #$paupApp = "/usr/local/bin/paup"; $paupApp = `which paup`; chomp($paupApp); # TREE SEARCH METHOD # tree search method for simultaneous analysis tree & support tests $treeSearchMethod = "hsearch nreps=200 swap=tbr multrees=no"; # BOOTSTRAP TREE SEARCH METHOD # specify method for bootstrap tree searches # $bootstrapMethod = "search=bandb"; $bootstrapMethod = ""; # SUPPORT TEST PARAMETERS # define which support tests will be performed # 0 = NO; 1 = YES # runBS -- Branch Support # runPBS -- Partitioned Bremer Support # runILD -- Incongruence Length Difference $runBS = 1; $runPBS = 1; $runILD = 0; # ALIGNMENT PROGRAM SYNTAX # define alignment program syntax # will replace [inputFile] with input FASTA file # & [outputfile] with output FASTA file $alignmentApp = `which muscle`; chomp($alignmentApp); $alignmentProgramSyntax = "$alignmentApp -in [inputFile] -out [outputFile]"; ######################################################################## # ** THERE SHOULD BE NO NEED TO CHANGE ANY OF THE SETTINGS BELOW HERE ** ######################################################################## $version = "1.0"; # make sure input syntax is correct if (@ARGV < 1) { print "Incorrect Syntax:\n"; print " ASAP [input dir] [force|justMatrix]\n\n"; exit; } # get input directory name from command line; # and remove any trailing /'s $inputDir = $ARGV[0]; $inputDir =~ s/\/$//g; $forceASAP = 0; $justMatrix = 0; # get forceASAP flag if ($ARGV[1] =~ m/^force$/i) { $forceASAP = 1; } # get justMatrix flag (always force) if ($ARGV[1] =~ m/^justMatrix/i) { $forceASAP = 1; $justMatrix = 1; } # set Alignment pending and partition directory names $pendingAlignmentDirName = "$inputDir/_pendingAlignment"; $partitionDirName = "$inputDir/_partitions"; # define NEXUS fileName $nexusFileName = "$inputDir/_NEXUS.nex"; # create supportTests output directory (supportTests) $outputDirName = "$inputDir/supportTests"; if (-e "$outputDirName"){} else { `mkdir $outputDirName`; } # create outputTreeDirectory (trees) $outputTreeDirName = "$inputDir/trees"; if (-e "$outputTreeDirName"){} else { `mkdir $outputTreeDirName`; } # load custom settings file $customSettingsFileName = "$inputDir/ASAP.settings"; open (customSettingsFile, $customSettingsFileName); while () { chomp $_; if ($_ !~ m/^#/) { ($attribute, $value) = split(/\=/, $_); # get designated outgroup names if ($attribute =~ m/^OUTGROUP/i) { $defineOutgroup = $value; } # get custom partitions if ($attribute =~ s/^CHARSET[^A-Z0-9]*//i) { $attribute =~ s/[^A-Z0-9\,\.]//g; $value =~ s/[^A-Z0-9\,]//g; $customPartitionHash{$attribute} = $value; } # hidden force flags $forceASAP = 1 if ($attribute =~ m/^forceASAP/i); $runILD = 1 if ($attribute =~ m/^runILD/i); $justMatrix = 1 if ($attribute =~ m/^justMatrix/i); } } close (customSettingsFile); # open pendingAlignment directory opendir(pendingAlignmentDir, $pendingAlignmentDirName); # process all the contents of the pendingAlignment directory while (defined($pendingFileName = readdir(pendingAlignmentDir))) { # skip '.' and '..' next if $pendingFileName =~ /^\.\.?/; # add the directory name to the pendingFileName $pendingFileName = "$pendingAlignmentDirName/$pendingFileName"; # replace [inputFile] in the currentSyntax with the current filename $currAlignmentSyntax = $alignmentProgramSyntax; $currAlignmentSyntax =~ s/\[inputFile\]/$pendingFileName/; # replace [outputFile] with output file name, by just changing dir name $doneFileName = $pendingFileName; $doneFileName =~ s/^$pendingAlignmentDirName/$partitionDirName/; $currAlignmentSyntax =~ s/\[outputFile\]/$doneFileName/; # run alignment and remove pending file `$currAlignmentSyntax; rm $pendingFileName`; } # close the pendingAlignment directory close (pendingAlignmentDir); # check for currList file and trigger runASAP flag accordingly $runASAP = 0; if (-e "$inputDir/currList") { # create newList file `ls -l $partitionDirName > $inputDir/newList`; # compare newList and oldList $diffValue = `diff $inputDir/currList $inputDir/newList | wc -l`; # if the files are different, then trigger flag to run ASAP # and update currList file if ($diffValue != 0) { # trigger ASAP flag $runASAP = 1; # replace currList with newList `mv $inputDir/newList $inputDir/currList`; } # otherwise, delete the newList file (since no difference detected) else { # delete newList file `rm $inputDir/newList`; } } # in the event that the currList file does not exist, create it # and set ASAP to run else { # create currList file `ls -l $partitionDirName > $inputDir/currList`; # set runASAP flag to run $runASAP = 1; } # run ASAP if the runASAP flag has been triggered if ($runASAP == 1 || $forceASAP == 1) { print "run ASAP!\n"; #### #### CREATE NEXUS FILE FROM FASTA FILES #### # open partitionDirName directory as the faa file directory opendir (faaDirName, $partitionDirName); # process each file withing the faa file directory while (defined($faaFileName = readdir(faaDirName))) { # skip '.' and '..' next if $faaFileName =~ /^\.\.?/; ### Convert input files to UNIX format... open (faaFile, "$partitionDirName/$faaFileName"); # read in file and convert to UNIX newlines $_ = ; if ($_ =~ /\r/) { # instantiate temp file open (faaTemp, ">$partitionDirName/faaTemp"); # translate line breaks to UNIX & store to temp file $_ =~ s/\r\n/\n/g; # PC --> UNIX $_ =~ s/\r/\n/g; # MAC -> UNIX print (faaTemp "$_"); # replace pirFile with UNIX line breaks unlink ($faaFileName); system ("mv $partitionDirName/faaTemp $partitionDirName/$faaFileName"); close (faaFile); } else { close (faaFile); } # create partition name based on fileName -- remove extension # clean out any underscores or dashes -- replace with '.'s $partitionName = $faaFileName; $partitionName =~ s/\..*//; $partitionName =~ s/\_/\./g; $partitionName =~ s/-/\./g; # print "Processing partition: $partitionName [$faaDir/$faaFileName]\n"; # add partitionName to partitionHash $partitionHash{$partitionName} = 1; %taxonFileHash = (); # read in sequences open (faaFile, "$partitionDirName/$faaFileName"); while () { chomp $_; # replace tabs with spaces and replace double newlines with single ones $_ =~ s/\t/ /g; $_ =~ s/\n\n/\n/; # process comment line if ($_ =~ /[>]/) { # get taxon name from comment line $taxonName = "$_"; $taxonName =~ s/>//; $taxonHash{$taxonName}++; } # process sequence lines else { # add sequence to partitiontaxon Hash $partitiontaxonHash{$partitionName}{$taxonName} .= $_; # store length of partition to partitionLength Hash $partitionLength{$partitionName} = length ($partitiontaxonHash{$partitionName}{$taxonName}); } } } close (faaDirName); # initialize current character position $currentChar = 0; # go through each partition and print out matrix to nexus file foreach $partition (sort keys %partitionHash) { # determine start and end of character positions for partition $startPar{$partition} = $currentChar + 1; $endPar{$partition} = $startPar{$partition} + $partitionLength{$partition} - 1; $currentChar = $endPar{$partition}; # add partition comment line to data matrix $datamatrix .= "\n\t[partition \"$partition\" chars $startPar{$partition} - $endPar{$partition}]\n"; # go through taxon list and if in partition, print out matrix foreach $taxonName (sort keys %taxonHash) { # if taxon exists in partition, print out sequence if (exists $partitiontaxonHash{$partition}{$taxonName}) { # add sequence to data matrix $datamatrix .= "\t$taxonName\t$partitiontaxonHash{$partition}{$taxonName}\n"; } # otherwise, print out gaps for length of sequence else { # print out taxon name to data matrix $datamatrix .= "\t$taxonName\t"; # print out gaps for length of partition to data matrix for ($i=0; $i<$partitionLength{$partition}; $i++) { $datamatrix .= "?"; } $datamatrix .= "\n"; # add to exclusion taxSet Hash $excludeTaxSetHash{$partition}{$taxonName} = 1; } } } # determine number of taxons for NEXUS file $taxonNameCount = 0; foreach $taxonName (keys %taxonHash) { $taxonNameCount++; } ### print out NEXUS file # get current time & date ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime (time); if ($Month < 10) { $Month = "0" . $Month; } if ($Day < 10) { $Day = "0" . $Day; } if ($Minute < 10) { $Minute = "0" . $Minute;} if ($Second < 10) { $Second = "0" . $Second;} $Month++; $Year += 1900; $todayDate = "$Year.$Month.$Day $Hour:$Minute:$Second"; #print "\nWriting Nexus file to $nexusFileName\n"; open (nexusFile, ">$nexusFileName"); # header line for nexus files print nexusFile "#NEXUS\n\n"; print nexusFile "[$todayDate; file auto-generated by ASAP $version written by I.N. Sarkar]\n\n"; ## data block print nexusFile "Begin DATA;\n"; print nexusFile "\tDimensions ntax=$taxonNameCount nchar=$currentChar;\n"; print nexusFile "\tFormat symbols=\"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\" "; print nexusFile " interleave gap=- missing=?;\n"; print nexusFile "\tMatrix\n"; print nexusFile "$datamatrix\n"; print nexusFile ";\n"; print nexusFile "End;\n\n"; ## assumption block print nexusFile "Begin Assumptions;\n\n"; print nexusFile "\tOPTIONS DEFTYPE=unord PolyTcount=MINSTEPS;\n"; print nexusFile "\n"; # print out each partition charset foreach $partition (sort keys %partitionHash) { # print out partition charset print nexusFile "\tcharset $partition = $startPar{$partition}-$endPar{$partition};\n"; # craft the totalLine $totalLine .= "$partition:$startPar{$partition}-$endPar{$partition},"; } # print out custom partition charsets foreach $partition (sort keys %customPartitionHash) { print nexusFile "\tcharset $partition = "; foreach $subPartition (split(/,/, $customPartitionHash{$partition})) { print nexusFile " $startPar{$subPartition}-$endPar{$subPartition}"; } print nexusFile ";\n"; } # print out custom partitions & add custom partitions to charsetHash # clean out last comma from totalLine $totalLine =~ s/\,$/;/; # print out taxon exclusion sets for each partition print nexusFile "\n"; foreach $partition (sort keys %excludeTaxSetHash) { print nexusFile "\ttaxset missing\_$partition ="; foreach $taxonName (keys %{$excludeTaxSetHash{$partition}}) { print nexusFile " $taxonName"; } print nexusFile ";\n"; $missingTaxaList{"missing\_$partition"}++; } # print out taxon exclusion sets for custom partition charsets foreach $partition (sort keys %customPartitionHash) { $hasMissingTaxa = 0; %missingCustomHash = (); foreach $subPartition (split(/,/, $customPartitionHash{$partition})) { if (exists $excludeTaxSetHash{$subPartition}) { $hasMissingTaxa = 1; foreach $taxonName (keys %{$excludeTaxSetHash{$subPartition}}) { $missingCustomHash{$taxonName}++; } } } if ($hasMissingTaxa == 1) { print nexusFile "\ttaxset missing\_$partition ="; foreach $taxonName (sort keys %missingCustomHash) { print nexusFile " $taxonName"; } print nexusFile ";\n"; } } # close assumption block print nexusFile ";\n"; print nexusFile "End;\n"; ## paup block for ILD if ($runILD == 1) { print nexusFile "\n"; print nexusFile "Begin Paup;\n"; # print out CHARpar total line print nexusFile "\tCHARpar total = $totalLine\n"; # print out the remaining CHARpar lines # print out the pair-wise comparison CHARpar partitions foreach $partitionOne (sort keys %partitionHash) { # for each possible pair, only start printing out AFTER # current partition name $processPar = 0; foreach $partitionTwo (sort keys %partitionHash) { # if greater than first homTest, only print if either partition has # partition group symbol (&) if (($homTestNumber > 1) && ($partitionOne !~ m/\&/ && $partitionTwo !~ m/\&/)) { $processPar = 0; } # only print out CHARpar pair if processPar flag is set to 1 if ($processPar == 1) { # assign names to the partitions according to charsets $partitionOneName = $partitionOne; $partitionTwoName = $partitionTwo; # print out CHARpar line, starting with the first partition print nexusFile "\tCHARpar $partitionOneName\_$partitionTwoName = $partitionOneName:"; @partitionParts = split (/\&/, $partitionOne); # for each of the partition parts for the first partition, print out the start-end positions foreach $partitionPart (@partitionParts) { print nexusFile "$startPar{$partitionPart}-$endPar{$partitionPart} "; } # print out the second partition for the CHARpar line print nexusFile ", $partitionTwoName:"; @partitionParts = split (/\&/, $partitionTwo); # for each of the partition parts for the second partition, print out the start-end positions foreach $partitionPart (@partitionParts) { print nexusFile "$startPar{$partitionPart}-$endPar{$partitionPart} "; } print nexusFile ";\n"; } # if the two partitions are the same, then flip the processPar flag if ($partitionOne eq $partitionTwo) { $processPar = 1; } } } print nexusFile "End;\n"; } close (nexusFile); # exit application if only justMatrix flag is set. if ($justMatrix == 1) { print "\n\nMatrix created as: $nexusFileName\n\n"; exit; } #### #### GET INDIVIDUAL PARTITION & SIMULTANEOUS ANALYSIS TREES #### # instantiate command file to create trees open (createTreeCmdFile, ">$inputDir/getTrees.cmd"); print createTreeCmdFile "#NEXUS\n\n"; print createTreeCmdFile "set warnReset = no;\n"; print createTreeCmdFile "set increase = auto;\n"; # perform tree search & calculate bootstraps for SA tree print createTreeCmdFile "[SIMULTANEOUS ANALYSIS]\n"; print createTreeCmdFile "execute $nexusFileName;\n"; print createTreeCmdFile "outgroup $defineOutgroup;\n"; print createTreeCmdFile "$treeSearchMethod;\n"; print createTreeCmdFile "filter best;\n"; print createTreeCmdFile "savetrees file=$outputTreeDirName/_SA_AllTrees.nex format=nexus replace=yes root=yes;\n"; print createTreeCmdFile "contree /strict=yes treeFile=$outputTreeDirName/_SA_Consensus.nex replace=yes;\n"; print createTreeCmdFile "execute $outputTreeDirName/_SA_Consensus.nex;\n"; print createTreeCmdFile "savetrees file=$outputTreeDirName/_SA_Consensus.nex format=nexus replace=yes root=yes;\n"; print createTreeCmdFile "bootstrap $bootstrapMethod;\n"; print createTreeCmdFile "savetrees from=1 to=1 file=$outputTreeDirName/_SA_Bootstrap.nex savebootp=nodelabels maxdecimals=0 format=nexus replace=yes;\n\n"; # perform tree searches for each individual partition -- and do bootstraps foreach $partition (sort keys %partitionHash) { print createTreeCmdFile "[$partition]\n"; print createTreeCmdFile "execute $nexusFileName;\n"; print createTreeCmdFile "outgroup $defineOutgroup;\n"; print createTreeCmdFile "exclude all;\n"; print createTreeCmdFile "include $partition;\n"; print createTreeCmdFile "$treeSearchMethod;\n"; print createTreeCmdFile "filter best;\n"; print createTreeCmdFile "root;\n"; print createTreeCmdFile "savetrees file=$outputTreeDirName/$partition\_AllTrees.nex format=nexus replace=yes root=yes;\n"; print createTreeCmdFile "contree /strict=yes treeFile=$outputTreeDirName/$partition\_Consensus.nex replace=yes;"; print createTreeCmdFile "bootstrap $bootstrapMethod;\n"; print createTreeCmdFile "savetrees from=1 to=1 file=$outputTreeDirName/$partition\_Bootstrap.nex savebootp=nodelabels maxdecimals=0 format=nexus replace=yes;\n\n"; } # perform tree searches for each individual custom partition -- and do bootstraps foreach $partition (sort keys %customPartitionHash) { print createTreeCmdFile "[$partition]\n"; print createTreeCmdFile "execute $nexusFileName;\n"; print createTreeCmdFile "outgroup $defineOutgroup;\n"; print createTreeCmdFile "exclude all;\n"; print createTreeCmdFile "include $partition;\n"; print createTreeCmdFile "$treeSearchMethod;\n"; print createTreeCmdFile "filter best;\n"; print createTreeCmdFile "root;\n"; print createTreeCmdFile "savetrees file=$outputTreeDirName/$partition\_AllTrees.nex format=nexus replace=yes root=yes;\n"; print createTreeCmdFile "contree /strict=yes treeFile=$outputTreeDirName/$partition\_Consensus.nex replace=yes;"; print createTreeCmdFile "bootstrap $bootstrapMethod;\n"; print createTreeCmdFile "savetrees from=1 to=1 file=$outputTreeDirName/$partition\_Bootstrap.nex savebootp=nodelabels maxdecimals=0 format=nexus replace=yes;\n\n"; } print createTreeCmdFile "quit warnTsave=no;\n"; # close command file and execute it in Paup* close (createTreeCmdFile); system ("$paupApp $inputDir/getTrees.cmd"); #### #### RUN SUPPORT TESTS #### # read in tree file and pull just the tree --> N.B., only reads single (last) tree open (inputTreeFile, "$outputTreeDirName/_SA_Consensus.nex"); while () { chomp $_; # pull taxa translation information if ($_ =~ /\tTranslate/) { while () { chomp $_; if ($_ =~ /;/) { last; } else { $_ =~ s/[\t,]//g; ($taxonNumber, $taxonName) = split (/\s/, $_); $taxonTranslateHash{$taxonName} = $taxonNumber; $taxaCount++; $taxonNumTranslateHash{$taxonNumber} = $taxonName; } } } # only pull the tree line if ($_ =~ /^\t?tree/i) { # clean out paup comments $_ =~ s/\[.*\]//g; # separate out the tree from the paup directives ($paup, $inputTree) = split (/=/, $_); # clean out extranous spaces and semicolon from newick tree $inputTree =~ s/[\s;]//g; } } close (inputTreeFile); # translate input tree $translatedTree = $inputTree; foreach $taxonNum (keys %taxonNumTranslateHash) { $translatedTree =~ s/$taxonNum([,\)])/$taxonNumTranslateHash{$taxonNum}$1/; $translatedTree =~ s/$taxonNum$/$taxonNumTranslateHash{$taxonNum}$1/; } $translatedTree =~ tr/[\(\)]/[<>]/; #print "inputTree==> >$inputTree<\n"; #print "inputTree trans==> >$translatedTree<\n"; # get the list of taxon names from inputtree $taxonList = $inputTree; $taxonList =~ s/[\(,\)]/ /g; $taxonList =~ s/\s+/ /g; $taxonList =~ s/^[\s]+//g; $taxonList =~ s/[\s]+$//g; # populate taxonHash foreach $taxon (split (/\s/, $taxonList)) { $taxonHash{$taxon}++; #print "taxon:: >$taxon<\n"; } # get list of nodes @listNodes = &getNodes($inputTree,"()"); # delete root node shift @listNodes; # store each of the nodes as appropriate and $nodeNumber = 0; foreach $constraintNode (@listNodes) { #print "constrain--> $constraintNode\n"; # create node name (node#) $nodeNumber++; $conName = "Node$nodeNumber"; # clean up the constraint node, so that it is only comma-delimited $constraintNode =~ s/[\(,\)]/ /g; $constraintNode =~ s/^[\s]+//g; $constraintNode =~ s/[\s]+$//g; $constraintNode =~ s/\s+/,/g; # load the taxa from constraint node into constraintHash %constraintHash = (); foreach $taxon (split (/,/, $constraintNode)) { $constraintHash{$taxon}++; } # translate the taxa names in each constraint node $listConstraintNodeTranslate = ""; foreach $taxonNum (split (/,/, $constraintNode)) { $listConstraintNodeTranslate .= "$taxonNumTranslateHash{$taxonNum},"; } $listConstraintNodeTranslate =~ s/,$//; #print "list---> $listConstraintNodeTranslate\n"; # store the translated list of taxa names into hash $listConstraintTaxa{$conName} = $listConstraintNodeTranslate; # store the translated list of taxa names into hash $listConstraintTaxa{$conName} = $listConstraintNodeTranslate; # put parens around the constraint node $constraintNode = "($constraintNode)"; # add the rest of taxa from the non-constrained part of the tree foreach $taxon (keys %taxonNumTranslateHash) { if (!exists $constraintHash{$taxon}) { $constraintNode = "$taxon,$constraintNode"; } } # put parens around the whole tree $constraintNode = "($constraintNode)"; #print "constraintNode: $constraintNode\n"; # add constraintNode to conNameHash $conNameHash{$conName} = $constraintNode; } open (inputFile, $nexusFileName); while () { chomp $_; # read in the number of characters from Dimensions line if ($_ =~ s/\tDimensions //) { $numTaxa = $_; $numTaxa =~ s/\s.*//; $numTaxa =~ s/^.*\=//; } # if line is a charset line, read in charset name if ($_ =~ s/charset//i) { # clean out junk chars and parse out just charset name $_ =~ s/[\t\s]//g; $_ =~ s/\=.+$//; # store charset name into hash $charsetHash{$_}++; #print "Charset: \"$_\"\n"; } # read in (missing) taxset lines if ($_ =~ s/taxset//i) { # clean out junk chars and parse out just charset name $_ =~ s/^\s//; $_ =~ s/[\t;]//g; ($missingName, $missingTaxa) = split (/\=/, $_); $missingTaxa =~ s/^\s//; $missingName =~ s/\s//g; # take note of only the missing taxa... if ($missingName =~ /missing/) { # store the missingTaxa to associate with missing_partition $missingTaxaList{$missingName} = $missingTaxa; # determine number of missing taxa @individualTaxa = split (/\s+/, $missingTaxa); $missingTaxaListCount{$missingName} = $#individualTaxa + 1; # store charset name into hash $taxsetHash{$_}++; } } } # add noConstraints to conNameTreeHash $conNameHash{"_noConstraint"}++; # add "all" to charsets $charsetHash{"_all"}++; # create root output directory if (-e $outputDirName) {} else {mkdir($outputDirName, 0777)}; # go through and create this directories if they have not been created yet foreach $conName (sort keys %conNameHash) { #print "creating $directory\n"; # create output directory if it does not already exist $directoryPBS = "$outputDirName/PBS"; $directoryPBSCon = "$outputDirName/PBS/$conName"; # create PBS directory if (-e $directoryPBS) {} else {mkdir($directoryPBS, 0777)}; # create constraint directory if (-e $directoryPBSCon) {} else {mkdir($directoryPBSCon, 0777)}; } # create BS directory $directoryBS = "$outputDirName/BS"; if (-e $directoryBS) {} else {mkdir($directoryBS, 0777)}; # create ILD directory if ($runILD == 1) { $directoryILD = "$outputDirName/ILD"; if (-e $directoryILD) {} else {mkdir($directoryILD, 0777)}; } ######### PBS # create paup command files to do (Partition Bremer Support) at each constraint # and then execute it if ($runPBS == 0) { print "\tSkipping PBS analyses..\n"; } elsif ($runPBS == 1) { print "\tNow performing PBS analyses...\n"; foreach $conName (sort keys %conNameHash) { # instantiate commandFile $commandFileName = "$outputDirName/PBS/$conName/\_$conName\_PBS\_command.paup"; open (commandFile, ">$commandFileName"); # print nexus line print commandFile "#NEXUS\n\n"; print commandFile "set increase = auto;\n"; print commandFile "set WarnRoot=no;\n"; # execute the matrix & tree file print commandFile "execute $nexusFileName;\n"; print commandFile "execute $outputTreeDirName/_SA_Consensus.nex;\n"; # if testing no constraint, use $treeSearchMethod to get the full tree if ($conName eq "_noConstraint") { print commandFile "$treeSearchMethod noenforce;\n"; } # otherwise, constrain the tree according to the current constraint name else { print commandFile "CONSTRAINTS $conName = $conNameHash{$conName};\n"; print commandFile "$treeSearchMethod enforce converse constraints = $conName;\n"; } print commandFile "\n"; # for each character set # send output to appropriate log file, and send lenfit values to log file foreach $charset (sort keys %charsetHash) { $charset =~ s/^\_//; print commandFile "[PBS for charset $charset at constraint $conName]\n"; print commandFile "log start file = $outputDirName/PBS/$conName/$charset\_PBS.log replace = yes;\n"; print commandFile "[!\n**** PBS for charset $charset at constraint $conName ****]\n"; print commandFile "exclude all;\n"; print commandFile "include $charset;\n"; print commandFile "lenfit /CI RI RC;\n"; print commandFile "log stop;\n"; print commandFile "\n"; } print commandFile "\n"; # exit paup, with no warning print commandFile "quit /warnTsave=no;\n"; # close out the commandFile close (commandFile); # now execute the command file in paup #`$paupApp $commandFileName`; system ("$paupApp $commandFileName"); } } ######### BS # create paup command files to perform BS (Branch support) at each partition # and then execute it if ($runBS == 0) { print "\tSkipping BS analyses...\n"; } elsif ($runBS == 1) { print "\tNow performing BS analyses...\n"; # instantiate BS command file $commandFileName = "$outputDirName/BS/\_BS\_command.paup"; open (commandFile, ">$commandFileName"); # print nexus line print commandFile "#NEXUS\n\n"; print commandFile "set increase = auto;\n"; print commandFile "set WarnRoot=no;\n"; # execute the matrix & treeFile print commandFile "execute $nexusFileName;\n"; print commandFile "execute $outputTreeDirName/_SA_Consensus.nex;\n"; # define constraints foreach $conName (sort keys %conNameHash) { if ($conName ne "_noConstraint") { print commandFile "CONSTRAINTS $conName = $conNameHash{$conName};\n"; } } # run commands for each charset foreach $charset (sort keys %charsetHash) { $charset =~ s/^\_//; print commandFile "[BS for charset $charset]\n"; print commandFile "log start file = $outputDirName/BS/$charset\_BS.log replace = yes;\n"; print commandFile "[!\n **** BS for charset $charset ****]\n"; print commandFile "exclude all;\n"; print commandFile "include $charset;\n"; # process each of the constraints foreach $conName (sort keys %conNameHash) { print commandFile "[!\n==>constraint: $conName\n]\n"; # if testing no constraint, use $treeSearchMethod to get the full tree if ($conName eq "_noConstraint") { print commandFile "$treeSearchMethod noenforce;\n"; } # otherwise, constrain the tree according to the current constraint name else { print commandFile "$treeSearchMethod enforce converse constraints = $conName;\n"; } print commandFile "lenfit /CI RI RC;\n"; } print commandFile "log stop;\n"; print commandFile "\n"; } # exit paup, with no warning print commandFile "quit /warnTsave=no;\n"; close (commandFile); # run BS command file #`$paupApp $commandFileName`; system ("$paupApp $commandFileName"); } ######### TABULATE & CALCULATE BS, PBS, & PHBS # tabulate the PBS values print "\tNow tabulating PBS values...\n"; foreach $conName (sort keys %conNameHash) { # get the information from each log file @PBSfiles = split (/\n/, `ls $outputDirName/PBS/$conName/*_PBS.log`); foreach $PBSfile (@PBSfiles) { # get the charset name from the file name $charset = $PBSfile; $charset =~ s/^.+\///g; $charset =~ s/\_.+//; open (inputFile, $PBSfile); while () { # get tree length #$length = `grep \"Length \" $PBSfile`; chomp $_; # number of Characters if ($_ =~ s/^\s+Of the remaining\s+//) { $numChars = $_; $numChars =~ s/\s+included characters:$//; } # number of Parsimony Informative Characters chomp $parsInf; if ($_=~ s/^\s+Number of \(included\) parsimony-informative characters =\s+//) { $parsInf = $_; } if ($_ =~ s/^\s*Length\s+//) { $_ =~ s/\s+.+//; $length = $_; } # get CI if ($_ =~ s/^\s*CI\s+//) { $_ =~ s/\s+.+//; $ci = $_; } # get RI if ($_ =~ s/^\s*RI\s+//) { $_ =~ s/\s+.+//; $ri = $_ } # get RC if ($_ =~ s/^\s*RC\s+//) { $_ =~ s/\s+.+//; $rc = $_ } } close (inputFile); # store values into appropriate hashes $PBSlengthHash{$charset}{$conName} = $length; $PBSciHash{$charset}{$conName} = $ci; $PBSriHash{$charset}{$conName} = $ri; $PBSrcHash{$charset}{$conName} = $rc; $PBSnumCharsHash{$charset}{$conName} = $numChars; $PBSparsInfHash{$charset}{$conName} = $parsInf; } } # tabulate the BS values print "\tNow tabulating BS values...\n"; @BSfiles = split (/\n/, `ls $outputDirName/BS/*_BS.log`); foreach $BSfile (@BSfiles) { # get charset from file name $charset = $BSfile; $charset =~ s/^.+\///g; $charset =~ s/\_.+//; # process each branch-support file open (inputFile, $BSfile); while () { chomp $_; # get constraint name if ($_ =~ s/==>constraint: //) { $conName = $_; } # number of Characters if ($_ =~ s/^\s+Of the remaining\s+//) { $numChars = $_; $numChars =~ s/\s+included characters:$//; } # number of Parsimony Informative Characters chomp $parsInf; if ($_ =~ s/^\s+Number of \(included\) parsimony-informative characters =\s+//) { $parsInf = $_; } # number of trees retained if ($_ =~ s/^\s+Number of trees retained =\s+//) { $numTrees = $_; } # get best tree length if ($_ =~ s/^\s*Length\s+//) { $_ =~ s/\s+.+//; $length = $_; } # get CI if ($_ =~ s/^\s*CI\s+//) { $_ =~ s/\s+.+//; $ci = $_; } # get RI if ($_ =~ s/^\s*RI\s+//) { $_ =~ s/\s+.+//; $ri = $_; } # get RC -- After getting RC, load values into appropriate hashes if ($_ =~ s/^\s*RC\s+//) { $_ =~ s/\s+.+//; $rc = $_; # store values into appropriate hashes $BSlengthHash{$charset}{$conName} = $length; $BSciHash{$charset}{$conName} = $ci; $BSriHash{$charset}{$conName} = $ri; $BSrcHash{$charset}{$conName} = $rc; $BSnumTreesHash{$charset}{$conName} = $numTrees; $BSnumCharsHash{$charset}{$conName} = $numChars; $BSparsInfHash{$charset}{$conName} = $parsInf; } } } # now, calculate the PBS, BS, and PHBS values foreach $charset (keys %charsetHash) { foreach $conName (keys %{$BSlengthHash{$charset}}) { # PBS value = conName - noConstraint $PBSvalueHash{$charset}{$conName} = $PBSlengthHash{$charset}{$conName} - $PBSlengthHash{$charset}{"_noConstraint"}; # BS value = conName - noConstraint $BSvalueHash{$charset}{$conName} = $BSlengthHash{$charset}{$conName} - $BSlengthHash{$charset}{"_noConstraint"}; # PHBS value = PBS - BS $PHBSvalueHash{$charset}{$conName} = $PBSvalueHash{$charset}{$conName} - $BSvalueHash{$charset}{$conName}; # sum up the PBS, BS, and PHBS for total values $PBStotal{$conName} += $PBSvalueHash{$charset}{$conName}; $BStotal{$conName} += $BSvalueHash{$charset}{$conName}; $PHBStotal{$conName} += $PHBSvalueHash{$charset}{$conName} } } if ($runPBS == 1 || $runBS == 1) { # send output to tab-delimited file $outputFileName = "$inputDir\/_branchSupports.txt"; print "\tSending tab-delimited results to $outputFileName\n"; open (outputFile, ">$outputFileName"); # send output, organized on a node-by-node basis foreach $conName (sort keys %conNameHash) { print outputFile "Constraint: $conName ($listConstraintTaxa{$conName})\n"; print outputFile "Charset\tPBS[treeLength]\tPBS[numChars]\tPBS[ParsInf]\tPBS[CI]\tPBS[RI]\tPBS[RC]\t"; print outputFile "BS[numTrees]\tBS[treeLength]\tBS[numChars]\tBS[ParsInf]\tBS[CI]\tBS[RI]\tBS[RC]\t"; print outputFile "PBS[value]\tBS[value]\tPHBS[value]\n"; foreach $charset (sort {lc $a cmp lc $b} keys %charsetHash) { $charset =~ s/^\_//; print outputFile "$charset\t"; print outputFile "$PBSlengthHash{$charset}{$conName}\t$PBSnumCharsHash{$charset}{$conName}\t$PBSparsInfHash{$charset}{$conName}\t$PBSciHash{$charset}{$conName}\t$PBSriHash{$charset}{$conName}\t$PBSrcHash{$charset}{$conName}\t"; print outputFile "$BSnumTreesHash{$charset}{$conName}\t$BSlengthHash{$charset}{$conName}\t$BSnumCharsHash{$charset}{$conName}\t$BSparsInfHash{$charset}{$conName}\t$BSciHash{$charset}{$conName}\t$BSriHash{$charset}{$conName}\t$BSrcHash{$charset}{$conName}\t"; print outputFile "$PBSvalueHash{$charset}{$conName}\t"; print outputFile "$BSvalueHash{$charset}{$conName}\t"; print outputFile "$PHBSvalueHash{$charset}{$conName}\t"; print outputFile "\n"; } print outputFile "\n\n"; } close (outputFile); } # print out values to appropriate tree files # insert PBS values $PBStree = $translatedTree; foreach $conName (sort keys %translatedConstraintHash) { $PBStree =~ s/($translatedConstraintHash{$conName})/$1$PBStotal{$conName}/; } # insert BS values $BStree = $translatedTree; foreach $conName (sort keys %translatedConstraintHash) { $BStree =~ s/($translatedConstraintHash{$conName})/$1$BStotal{$conName}/; } # insert PHBS values $PHBStree = $translatedTree; foreach $conName (sort keys %translatedConstraintHash) { $PHBStree =~ s/($translatedConstraintHash{$conName})/$1$PHBStotal{$conName}/; } # translate <>'s back to ()'s $PBStree =~ tr/[<>]/[\(\)]/; $BStree =~ tr/[<>/[\(\)]/; $PHBStree =~ tr/[<>]/[\(\)]/; # write out PBS tree files into phylip format $outputFileName = $inputTreeFileName; $outputFileName =~ s/\..*$/\.PBS\.phylip\.tre/; open (outputFile, ">$outputFileName"); print outputFile "$PBStree;\n"; close (outputFile); # write out BS tree files into phylip format $outputFileName = $inputTreeFileName; $outputFileName =~ s/\..*$/\.BS\.phylip\.tre/; open (outputFile, ">$outputFileName"); print outputFile "$BStree;\n"; close (outputFile); # write out PHBS tree files into phylip format $outputFileName = $inputTreeFileName; $outputFileName =~ s/\..*$/\.PHBS\.phylip\.tre/; open (outputFile, ">$outputFileName"); print outputFile "$PHBStree;\n"; close (outputFile); ###### ILD if ($runILD == 1) { $homPartReps = 100; $hsearchReps = 10; # remove the "_all" charset from charsetHash delete $charsetHash{"_all"}; # write pair-wise ILD test paup command file # perform pair-wise ILD test print "\tNow performing pair-wise ILD tests\n"; # create and instantiate homogeneity-partition test file $homTestFileName = "$directoryILD/\_homPart\_command.paup"; open (homTestFile, ">$homTestFileName"); # craft nexus file for performing test print homTestFile "#NEXUS\n\n"; # start paup block print homTestFile "Begin Paup;\n"; # execute the nexus data file print homTestFile "execute $nexusFileName;\n"; print homTestFile "set increase = auto;\n"; # create each partition test --> send output to log file # go through each partition in the partitionHash foreach $partitionOne (sort keys %charsetHash) { # process each pairwise partition comparison $processPar = 0; foreach $partitionTwo (sort keys %charsetHash) { # create log file name $homPartLogFile = "$directoryILD\/$partitionOne\_$partitionTwo\_homTest\.log"; # only process partitions AFTER current partition if ($processPar == 1) { # only run the pht if remaining taxa is greater than 4 if (exists ($missingTaxaList{"missing\_$partitionOne"}) || exists ($missingTaxaList{"missing\_$partitionTwo"})) { $leftTaxa = $numTaxa - $missingTaxaListCount{"missing\_$partitionOne"} - $missingTaxaListCount{"missing\_$partitionTwo"}; } else { $leftTaxa = $numTaxa; } if ($leftTaxa >= 4) { # print out comment lines to partition lest print homTestFile "\n"; print homTestFile "[partition homogeneity test -- $partitionOne vs $partitionTwo]\n"; # exclude all the characters print homTestFile "exclude all;\n"; # include only partition characters print homTestFile "include "; # specifify partition character blocks print homTestFile "$partitionOne "; print homTestFile "$partitionTwo"; print homTestFile ";\n"; # include all taxa print homTestFile "restore all;\n"; # remove missing taxa if (exists ($missingTaxaList{"missing\_$partitionOne"}) || exists ($missingTaxaList{"missing\_$partitionTwo"})) { print homTestFile "delete "; if (exists ($missingTaxaList{"missing\_$partitionOne"})) { print homTestFile "missing\_$partitionOne "; } if (exists ($missingTaxaList{"missing\_$partitionTwo"})) { print homTestFile "missing\_$partitionTwo"; } print homTestFile ";\n"; } # start log file print homTestFile "log file=$homPartLogFile replace=yes start=yes;\n"; # run partition homogeneity test print homTestFile "hompart partition=$partitionOne\_$partitionTwo nreps=$homPartReps seed=14 search=heuristic / nreps=$hsearchReps;\n"; # stop log file print homTestFile "log stop;\n"; } # otherwise, make dummy partition homogeneity result file else { # print out comment lines to partition lest print homTestFile "\n"; print homTestFile "[partition homogeneity test -- $partitionOne vs $partitionTwo]\n"; print homTestFile "[ *** Skipped because taxa count is only $leftTaxa (i.e., less than 4) *** ]\n"; open (dummyFile, ">$homPartLogFile"); print dummyFile " P value = 1 - (0/100) = NA\n"; print dummyFile "DUMMY FILE FOR $partitionOne vs $partitionTwo\n"; print dummyFile "*** Skipped because taxa count is only $leftTaxa (i.e., less than 4) *** \n"; close (dummyFile); } } if ($partitionOne eq $partitionTwo) { $processPar = 1; } } } # quit paup print homTestFile "\n\nquit /warnTsave=no;\n"; # close paup block print homTestFile "End;\n"; # close part homTest file & execute partition-homogeneity test close (homTestFile); #`$paupApp $homTestFileName`; system ("$paupApp $homTestFileName"); #### Tabulate & Create ILD matrix print "\tTabulating ILD results\n"; # retrieve all the p-values for all the partition tests @homTestall = split (/\n/, `grep \"P value\" $directoryILD/*homTest.log`); # clear the partitionPvalueHash %partitionPvalueHash = (); # parse out and store the homPart results foreach $homTest (@homTestall) { # clean out directory information $homTest =~ s/$directoryILD\///; # parse out the parts of the line @homTestParts = split (/\=/, $homTest); $pvalue = pop (@homTestParts); pop (@homTestParts); $homTestNames = pop (@homTestParts); # get the names of the partitions @partNames = split (/_/, $homTestNames); pop (@partNames); $partOne = pop (@partNames); $partTwo = pop (@partNames); # load into partitionPvalueHash $partitionPvalueHash{$partOne}{$partTwo} = $pvalue; # set self partitions to *'s $partitionPvalueHash{$partOne}{$partOne} = "1"; $partitionPvalueHash{$partTwo}{$partTwo} = "1"; } # create and instantiate output matrix file $outputMatrixFileName = $nexusFileName; $outputMatrixFileName =~ s/\..*$/\_ILDmatrix\.txt/; open (outputMatrixFile, ">$outputMatrixFileName"); # create and instantiate output hash file #$outputHashFileName = $nexusFileName; #$outputHashFileName =~ s/\..*$/\_ILDhash\.txt/; #open (outputHashFile, ">$outputHashFileName"); # initialize partitionPair hash %partitionPair = (); # print out header row for output matrix foreach $partition (sort keys %partitionPvalueHash) { print outputMatrixFile "\t$partition"; } print outputMatrixFile "\n"; # print out matrix file & load values into partitionPair hash foreach $partitionOne (sort keys %partitionPvalueHash) { print outputMatrixFile "$partitionOne\t"; foreach $partitionTwo (sort keys %{$partitionPvalueHash{$partitionOne}}) { print outputMatrixFile "$partitionPvalueHash{$partitionOne}{$partitionTwo}\t"; # symmetrically add the p-values to the partitionPair hash file #print outputHashFile "\$partitionPvalueHash{\"$partitionOne\"}{\"$partitionTwo\"} = \"$partitionPvalueHash{$partitionOne}{$partitionTwo}\";\n"; #print outputHashFile "\$partitionPvalueHash{\"$partitionTwo\"}{\"$partitionOne\"} = \"$partitionPvalueHash{$partitionOne}{$partitionTwo}\";\n"; } print outputMatrixFile "\n"; } close (outputMatrixFile); #close (outputHashFile); } } #s## #s getNodes #s read out nodes within parenthetical statements #s and report all nodes as a single array #s #s original code for finding quotes #s (modified only for aesthetics) #s by T. Christiansen sub getNodes { local($_, $qchars) = @_; local($xlate); if ($qchars =~ tr/\173\175/\373\375/) { $xlate++; tr/\173\175/\373\375/; } local($qL, $qR); # left and right quote chars, like `' or () local($quote_level); # current quote level local($max_quote); # deepest we've gotten local($qstring); # tmp space for quote local(@quotes); # list of quotes to return local($d) = '\$'; # not sure why this must be here local($b) = '\\'; # nor this local(@done); # which quotes we've finished so far die "need two just quote chars" if length($qchars) != 2; $qL = substr($qchars, 0, 1); $qR = substr($qchars, 1, 1); s/\\(.)/"\201".ord($1)."\202"/eg; # protect from backslashes $max_quote = $quote_level = $[-1; while ( /[$qchars]/ ) { if ($& eq $qL) { do { ++$quote_level; } while $done[$quote_level]; s/$b$qL/\${QL${quote_level}}/; $max_quote = $quote_level if $max_quote < $quote_level; } elsif ($& eq $qR) { s/$b$qR/\${QR${quote_level}}/; $done[$quote_level]++; do { --$quote_level; } while $done[$quote_level]; } else { die "unexpected quot char: $&"; } } for ($quote_level = $[; $quote_level <= $max_quote; $quote_level++) { ($qstring) = /${d}\{QL$quote_level\}([^\000]*)${d}\{QR$quote_level}/; $qstring =~ s/\${QL\d+\}/$qL/g; $qstring =~ s/\${QR\d+\}/$qR/g; $qstring =~ s/\201(\d+)\202/pack('C',$1)/eg; $quotes[$quote_level] = $qstring; } grep (tr/\373\375/\173\175/, @quotes) if $xlate; @quotes; }