// at this moment two arrays are available: $seqs (with sequences) and $seq_names (with name of sequences)
";
// PRINT TABLE IN THE TOP WITH SEQUENCE
// show sequence in the top when requested (showcode==1)
if ($_POST["showcode"]==1){
print "
";
foreach($sequence as $key => $val){
$seq1=$sequence[$key]["seq"] ;
if ($sequence[$key]["name"]!="")
{
print "
| >".$sequence[$key]["name"]." |
\n";
if ((substr_count($seqs,"A")+substr_count($seqs,"C")+substr_count($seqs,"G")+substr_count($seqs,"T"))>(strlen($seqs)/2)){
// if A+C+G+T is at least half of the sequence, it is a DNA
$sequence = extract_sequences($text);
}else{
// else is protein
// calculate nucleotide composition
$aminoacid_content=aminoacid_content($sequence);
// calculate molecular weight of protein
$molweight=protein_molecular_weight($sequence,$aminoacid_content);
$result.="
Molecular weight:
$molweight Daltons";
$abscoef=molar_absorption_coefficient_of_prot($sequence,$aminoacid_content,$molweight);
$result.="
Molar Absorption Coefficient at 280 nm:
".round($abscoef,2);
protein_molecular_weight ($sequence,$aminoacid_content);
}
}
//$nuc["G"]= substr_count($seq1,"G");
//$nuc["T"]=substr_count($seq1,"T");
//$nuc["A"]=substr_count($seq1,"A");
//$nuc["C"]=substr_count($seq1,"C");
//Anhydrous Molecular Weight = (An x 313.21) + (Tn x 304.2) + (Cn x 289.18) + (Gn x 329.21) - 61.96
print "
Lengh of code: ".strlen($seq1)."\n";//."\tG+C=".floor((substr_count($seq1,"G")+substr_count($seq1,"C"))*100/strlen($seq1))."%\n A:". $nuc["A"]."\t T:". $nuc["T"]."\t G:". $nuc["G"]."\t C:". $nuc["C"]."\n MOLECULAR WEIGHT=".floor(((substr_count($seq1,"A")*313.21)+(substr_count($seq1,"T")*304.2)+(substr_count($seq1,"G")*289.18)+(substr_count($seq1,"C")*329.21))-69.96)."D\n"; // print "molecular weight: ".$molweight($sequence)." Daltons\n"; // print "molecular weight: ".protein_molecular_weight ($sequence,$aminoacid_content)+$molweight."\n";
$s=0;
$rtop=chunk_split($seq1,10,' ');
while ($s<=strlen($rtop)){
$rline=substr ($rtop, $s, 110);
print "$rline ";
$s=$s+110;
if (strlen($rline)==110){print $s/1.1;}
print "\n";
}
print " | \n";
}
print " | |
\n";
}
//###############################################
// COMPUTE DISTANCES
// GET METHOD
if ($_POST["method"]=="euclidean"){ // EUCLIDEAN DISTANCE
// COMPUTE OLIGONUCLEOTIDE FREQUENCIES
foreach($seqs as $key => $val){
// to compute oligonucleotide frequencies, both strands are used
$seq_and_revseq=$val." ".RevComp($val);
$oligo_array[$key]= oligo_frequencies_standar($seq_and_revseq,$_POST["len"]);
}
// COMPUTE DISTANCES AMONG SEQUENCES
// by computing Euclidean distance
// standarized oligonucleotide frequencies in $oligo_array are used, and distances are stored in $data array
foreach($seqs as $key => $val){
foreach($seqs as $key2 => $val2){
if ($key>=$key2){continue;}
$data[$key][$key2]= Euclid_distance($oligo_array[$key],$oligo_array[$key2],$_POST["len"]);
}
}
}else{
// COMPUTE OLIGONUCLEOTIDE FREQUENCIES
foreach($seqs as $key => $theseq){
$oligo_array[$key]= compute_zscores_for_tetranucleotides($theseq);
}
// COMPUTE DISTANCES AMONG SEQUENCES
// by computing Pearson distance
// standarized oligonucleotide frequencies in $oligo_array are used, and distances are stored in $data array
foreach($seqs as $key => $val){
foreach($seqs as $key2 => $val2){
if ($key>=$key2){continue;}
$data[$key][$key2]= Pearson_distance($oligo_array[$key],$oligo_array[$key2]);
}
}
}
// DISPLAY TABLE WITH DISTANCES
print "
\n| Distances | ";
for($i=1;$i print "$i | ";
}
print "
\n";
foreach($data as $key => $val){
print "| $key | ";
for($i=1;$i if ($data[$key][$i]){
$aa=floor(255*($data[$key][$i]));
print "".$data[$key][$i]." | ";
}else{
print " | ";
}
}
print "
\n";
}
print "
\n";
// DISPLAY NAME OF SEQUENCES
print "
| \n Name of sequences\n \n"; foreach($seq_name as $n => $name){ print " $n\t$name\n"; } print " |
\n";
// DISPLAY SELECTED OLIGONUCLEOTIDE LENGTH
//print "Length of oligoncleotides for comparison: ".$_POST["len"];
// NEXT LINES WILL PERFORM UPGMA CLUSTERING
// in each loop, array $data is reduced (one case per loop)
while (sizeof($data)>1){
$min=min_array($data); // global variables are created: $x, $y, $min, $cases
$comp[$x][$y]=$min;
$data=new_array($data,$cases,$x,$y);
}
$min=min_array($data);
$comp[$x][$y]=$min;
// end of clustering
// array $comp stores the important data
// $textcluster is the results of the cluster as text.
// p.e.: ((3,4),7),(((5,6),1),2)
$textcluster="$x,$y";
print "
Clustering method: UPGMA
$textcluster
";
// $max is the distance in the last clustering step (the root of the dendrogram)
$max=$a[$x][$y];
// CREATE THE IMAGE WITH THE DENDROGRAM
create_dendrogram ($textcluster,$comp,$max,$_POST["method"],$_POST["len"]);
// SHOW DENDROGRAM
print '
';
// SHOW TIME REQUIRED FOR COMPUTING
$timetotal=date("U")-$timestart;
print "
Computed in $timetotal seconds
";
// #######################################################################################
// ############################## FUNCTIONS ####################################
// #######################################################################################
function get_cases($a){
$done="";
foreach($a as $key => $val){
$done.="#$key";
foreach($a[$key] as $key2 =>$val2){
$done.="#$key2";
}
}
$cases=preg_split("/#/",$done,-1,PREG_SPLIT_NO_EMPTY);
$cases=array_unique($cases);
sort($cases);
return $cases;
}
//#######################################################################################
function new_array($a,$cases,$x,$y){
$cases=get_cases($a);
for($j=0; $j $key=$cases[$j];
// next 3 lines are required in windows for correct comparison
settype($key, "string");
settype($x, "string");
settype($y, "string");
if ($key==$x or $key==$y){continue;}
if ($a[$key][$x]!=""){
if ($a[$key][$y]!=""){$temp_a[$key]["($x,$y)"]=($a[$key][$x]+$a[$key][$y])/2;}
if ($a[$x][$key]!=""){$temp_a[$key]["($x,$y)"]=($a[$key][$x]+$a[$x][$key])/2;}
if ($a[$y][$key]!=""){$temp_a[$key]["($x,$y)"]=($a[$key][$x]+$a[$y][$key])/2;}
}else{
if ($a[$key][$y]!=""){
if ($a[$x][$key]!=""){$temp_a[$key]["($x,$y)"]=($a[$key][$y]+$a[$x][$key])/2;}
if ($a[$y][$key]!=""){$temp_a[$key]["($x,$y)"]=($a[$key][$y]+$a[$y][$key])/2;}
}else{
if ($a[$y][$key]!=""){$temp_a[$key]["($x,$y)"]=($a[$y][$key]+$a[$y][$key])/2;}
}
}
for($i=$j+1; $i $key2=$cases[$i];
settype($key2, "string");
if ($key==$key2 or $key2==$x or $key2==$y){continue;}
if ($a[$key][$key2]!=""){$temp_a[$key][$key2]=$a[$key][$key2];}
if ($a[$key2][$key]!=""){$temp_a[$key][$key2]=$a[$key2][$key];}
}
}
return $temp_a;
}
//#######################################################################################
function min_array($a){
global $x, $y, $min;
global $cases; // an array for cases
$str_cases="";
$min=1000000;
foreach($a as $key =>$val){
$str_cases.="#$key";
foreach($a[$key] as $key2 =>$val2){
if ($val==""){continue;}
$str_cases.="#$key2";
if ($val2<$min){
$min=$val2;
$x=$key;
$y=$key2;
}
}
}
$cases=preg_split("/#/",$done,-1,PREG_SPLIT_NO_EMPTY);
$cases=array_unique($cases);
sort($cases);
$min2=$min/2;
return $min;
}
//#######################################################################################
function create_dendrogram($str,$comp,$max,$method,$len){
$w=20; //height for each line (case)
$str=preg_replace("/\(|\)/","",$str).",";
$a=preg_split("/,/",$str,-1,PREG_SPLIT_NO_EMPTY);
$rows=sizeof($a);
$width=600; // width of scale from 0 to 2
$im = @imagecreatetruecolor($width*1.2, $rows*$w+40) or die("Unable to start image. It is GD available?");
$white =imagecolorallocate($im, 255, 255, 255);
$black =imagecolorallocate($im, 0, 0, 0);
$red =imagecolorallocate($im, 255, 0, 0);
imagefilledrectangle($im,0,0,$width*1.2, $rows*$w+40,$white);
$y=$rows*$w; // vertical location
$f=$width; // multiplication factor
// lines for scale
$j=0.1;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black);
$j=0.2;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black);
$j=0.3;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black);
$j=0.5;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black);
$j=1.0;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, "1.0", $black);
$j=1.5;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black);
$j=2.0;
imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black);
imagestring($im, 1, log($j+1)*$f-8+20, $y+12, "2.0", $black);
// write into the image the numbers corresponding to cases
foreach($a as $n => $val){
if (strlen($val)==1){$val=" $val";}
imagestring($im,3, 5, $n*$w+5, $val, $black);
}
// WRITE LINES
foreach ($comp as $key => $val){
$pos1=$pos2=0;
foreach ($comp[$key] as $key2 => $val2){
// get position of case in the list
$keya=preg_replace("/\(|\)/","",$key);
$pos1=substr_count (" ,".substr ($str, 0,strpos(" ,".$str,",$keya,")),",")-0.4;
$keyb=preg_replace("/\(|\)/","",$key2);
$pos2=substr_count (" ,".substr ($str, 0,strpos(" ,".$str,",$keyb,")),",")-0.4;
if (substr_count($keya,",")>0){$pos1b=$pos1+substr_count($keya,",")/2;}else{$pos1b=$pos1;}
if (substr_count($keyb,",")>0){$pos2b=$pos2+substr_count($keyb,",")/2;}else{$pos2b=$pos2;}
// Position related data
$xkey1=$wherex[$key];
if ($xkey1==""){$xkey1=0;}
$xkey2=$wherex[$key2];
if ($xkey2==""){$xkey2=0;}
$max=max($xkey1,$xkey2);
$min=min($xkey1,$xkey2);
$xmax=$max+(($val2-($max))/2);
$val4=log($xmax+1)*$f;
$val4max=log($max+1)*$f;
$val4min=log($min+1)*$f;
// write lines
if ($wherex[$key]==$max){
imageline ($im, $val4max+20, $pos1b*$w, $val4+20, $pos1b*$w, $black); // ---
imageline ($im, $val4+20, $pos1b*$w, $val4+20, $pos2b*$w, $black); // |
imageline ($im, $val4min+20, $pos2b*$w, $val4+20, $pos2b*$w, $black); // ---
}else{
imageline ($im, $val4min+20, $pos1b*$w, $val4+20, $pos1b*$w, $black); // ---
imageline ($im, $val4+20, $pos1b*$w, $val4+20, $pos2b*$w, $black); // |
imageline ($im, $val4max+20, $pos2b*$w, $val4+20, $pos2b*$w, $black); // ---
}
$wherex["(".$key.",".$key2.")"]=$xmax;
}
}
imageline ($im, $val4+20, ($pos1b+$pos2b)*$w/2, $val4+40, ($pos1b+$pos2b)*$w/2, $black);
imageline ($im, 20, $y, $width*1.2, $y, $black);
if ($method=="euclidean"){
imagestring($im, 2, 5, $rows*$w+25, "Euclidean distance for $len bases long oligonucleotides.", $red);
}else{
imagestring($im, 2, 5, $rows*$w+25, "Pearson distance for z-scores of tetranucleotides.", $red);
}
imagestring($im, 2, $width*1, $rows*$w+25, "by phylopal", $black);
header( "Content-Type: image/png" );
imagepng($im,"image.png");
imagedestroy($im);
}
//######################## ###############################################################
function RevComp($code){
$code=strrev($code);
$code=str_replace("A", "t", $code);
$code=str_replace("T", "a", $code);
$code=str_replace("G", "c", $code);
$code=str_replace("C", "g", $code);
$code = strtoupper ($code);
return $code;
}
//#######################################################################################
function Euclid_distance($a,$b,$k){
// Wang et al, Gene 2005; 346:173-185
$c=sqrt(pow(2,$k))/pow(4,$k); // contant
$sum=0;
foreach($a as $key => $val){
$sum+= pow($val-$b[$key],2);
}
return $c*sqrt($sum);
}
//#######################################################################################
function Pearson_distance($vals_x,$vals_y){
// normal correlation
if (sizeof($vals_x)!= sizeof($vals_y)){return;}
$sum_x=0;
$sum_x2=0;
$sum_y=0;
$sum_y2=0;
$sum_xy=0;
$n=sizeof($vals_x);
foreach($vals_x as $key => $val){
$val_x=$val;
$val_y=$vals_y[$key];
$sum_x+=$val_x;
$sum_x2+=$val_x*$val_x;
$sum_y+=$val_y;
$sum_y2+=$val_y*$val_y;
$sum_xy+=$val_x*$val_y;
//print "$val_x\t$val_y\n";
}
// calculate regression
$regresion=($sum_xy-(1/$n)*$sum_x*$sum_y)/((sqrt($sum_x2-(1/$n)*$sum_x*$sum_x)*(sqrt($sum_y2-(1/$n)*$sum_y*$sum_y))));
if ($regresion>0.999999999){$regresion=1;} // round data
return (1-$regresion);
}
//#######################################################################################
function compute_zscores_for_tetranucleotides($theseq){
// as described by Teeling et al. BMC Bioinformatics 2004, 5:163.
$theseq.=" ".RevComp($theseq);
$i=0;
$len=strlen($theseq)-2+1;
while ($i<$len){
$seq=substr($theseq,$i,2);
$oligos2[$seq]++;
$i++;
}
$i=0;
$len=strlen($theseq)-3+1;
while ($i<$len){
$seq=substr($theseq,$i,3);
$oligos3[$seq]++;
$i++;
}
$i=0;
$len=strlen($theseq)-4+1;
while ($i<$len){
$seq=substr($theseq,$i,4);
$oligos4[$seq]++;
$i++;
}
$base_a=array("A","C","G","T");
$base_b=array("A","C","G","T");
$base_c=array("A","C","G","T");
$base_d=array("A","C","G","T");
$base_e=array("A","C","G","T");
$base_f=array("A","C","G","T");
// COMPUTE Z-SCORES FOR TETRANUCLEOTIDES
$i=0;
foreach($base_a as $key_a => $val_a){
foreach($base_b as $key_b => $val_b){
foreach($base_c as $key_c => $val_c){
foreach($base_d as $key_d => $val_d){
$exp[$val_a.$val_b.$val_c.$val_d] = ($oligos3[$val_a.$val_b.$val_c]*$oligos3[$val_b.$val_c.$val_d])/$oligos2[$val_b.$val_c];
$var[$val_a.$val_b.$val_c.$val_d] = $exp[$val_a.$val_b.$val_c.$val_d] *((($oligos2[$val_b.$val_c]-$oligos3[$val_a.$val_b.$val_c])*($oligos2[$val_b.$val_c]-$oligos3[$val_b.$val_c.$val_d]))/pow($oligos2[$val_b.$val_c],2));
$zscore[$i] = ($oligos4[$val_a.$val_b.$val_c.$val_d]-$exp[$val_a.$val_b.$val_c.$val_d])/sqrt($var[$val_a.$val_b.$val_c.$val_d]);
$i++;
}}}}
return $zscore;
}
//#######################################################################################
function oligo_frequencies_standar($cadena,$len_oligos){
$i=0;
$len_oligos=2;
$len=strlen($cadena)-$len_oligos+1;
while ($i<$len){
$seq=substr($cadena,$i,$len_oligos);
$oligos_internos[$seq]++;
$i++;
}
$base_a=array("A","C","G","T");
$base_b=array("A","C","G","T");
$base_c=array("A","C","G","T");
$base_d=array("A","C","G","T");
$base_e=array("A","C","G","T");
$base_f=array("A","C","G","T");
//para oligos de 2
if ($len_oligos==2){
foreach($base_a as $key_a => $val_a){
foreach($base_b as $key_b => $val_b){
if ($oligos_internos[$val_a.$val_b]){
$oligos[$val_a.$val_b] = $oligos_internos[$val_a.$val_b];
}else{
$oligos[$val_a.$val_b] = 0;
}
}}
}
$oligos=standar_frecuencies($oligos, $len_oligos);
return $oligos;
}
function standar_frecuencies($array, $m){
$sum=0;
foreach($array as $k => $v){
$sum+=$v;
}
$c=pow(4,$m)/$sum;
foreach($array as $k => $v){
$array[$k]= $c*$v;
}
return $array;
}
//###########################################################NEW###################################
function extract_sequences($text){
if (substr_count($text,">")==0){
$sequence[0]["seq"] = preg_replace ("/\W|\d/", "", strtoupper ($text));
}else{
$arraysequences=preg_split("/>/", $text,-1,PREG_SPLIT_NO_EMPTY);
$counter=0;
foreach($arraysequences as $key =>$val){
$seq1=substr($val,strpos($val,"\n"));
$seq1 = preg_replace ("/\W|\d/", "", strtoupper($seq1));
if (strlen($seq1)>0){
$sequence[$counter]["seq"] = $seq1;
$sequence[$counter]["name"]=substr($val,0,strpos($val,"\n"));
$counter++;
}
}
}
return $sequence;
}
######################################################################################################
//############################################################################
function remove_non_coding_prot($sequence) {
// change the sequence to upper case
$seq=strtoupper($sequence);
// remove non-coding characters([^ARNDCEQGHILKMFPSTWYVX\*])
$seq = preg_replace ("([^ARNDCEQGHILKMFPSTWYVX\*])", "", $seq);
return $sequence;
}
function aminoacid_content($sequence) {
$array=array("A"=>0,"R"=>0,"N"=>0,"D"=>0,"C"=>0,"E"=>0,"Q"=>0,"G"=>0,"H"=>0,"I"=>0,"L"=>0,
"K"=>0,"M"=>0,"F"=>0,"P"=>0,"S"=>0,"T"=>0,"W"=>0,"Y"=>0,"V"=>0,"X"=>0,"*"=>0);
for($i=0; $i $aa=substr($seq,$i,1);
$array[$aa]++;
}
return $array;
}
function molar_absorption_coefficient_of_prot($sequence,$aminoacid_content,$molweight) {
// Prediction of the molar absorption coefficient of a protein
// Pace et al. . Protein Sci. 1995;4:2411-23.
$abscoef = ($aminoacid_content["A"]*5500 + $aminoacid_content["Y"]*1490 + $aminoacid_content["C"]*125)/$molweight;
return $abscoef;
}
// molecular weight calculation
function protein_molecular_weight ($sequence,$aminoacid_content){
$molweight = $aminoacid_content["A"]*71.07; // for Alanine
$molweight+= $aminoacid_content["R"]*156.18; // for Arginine
$molweight+= $aminoacid_content["N"]*114.08; // for Asparagine
$molweight+= $aminoacid_content["D"]*115.08; // for Aspartic Acid
$molweight+= $aminoacid_content["C"]*103.10; // for Cysteine
$molweight+= $aminoacid_content["Q"]*128.13; // for Glutamine
$molweight+= $aminoacid_content["E"]*129.11; // for Glutamic Acid
$molweight+= $aminoacid_content["G"]*57.05; // for Glycine
$molweight+= $aminoacid_content["H"]*137.14; // for Histidine
$molweight+= $aminoacid_content["I"]*113.15; // for Isoleucine
$molweight+= $aminoacid_content["L"]*113.15; // for Leucine
$molweight+= $aminoacid_content["K"]*128.17; // for Lysine
$molweight+= $aminoacid_content["M"]*131.19; // for Methionine
$molweight+= $aminoacid_content["F"]*147.17; // for Phenylalanine
$molweight+= $aminoacid_content["P"]*97.11; // for Proline
$molweight+= $aminoacid_content["S"]*87.07; // for Serine
$molweight+= $aminoacid_content["T"]*101.10; // for Threonine
$molweight+= $aminoacid_content["W"]*186.20; // for Tryptophan
$molweight+= $aminoacid_content["Y"]*163.17; // for Tyrosine
$molweight+= $aminoacid_content["Z"]*99.13; // for Valine
$molweight+= 18.02; // water
$molweight+= $aminoacid_content["X"]*114.822; // for unkwon aminoacids, add avarage of all aminoacids
return $molweight;
}
########################################################################################################
function print_form(){
?>
|
PHYLO-PAL - This tool will compute distance between input sequences and clustering will be applied.
- Distances are shown in a table, and a dendrogram is displaied
|
Posted: 08:12, 4.04.2007 |
Kommentare (1) | Link |