#! /usr/bin/perl my $dir_prefix = "/var/www/html/search/data"; #my $dir_prefix = "/cellar/users/mdaly/cellcircuits/trunk/data"; my $hyper_p = "/cellar/users/cworkman/src/hypergeometric_Pvalue/hyper_p"; my $genes_beneath_terms_FILE = "n_genes_beneath_BY_GO_term_accession.tab"; my $usage=< <$genes_beneath_terms_FILE> + This program operates on gl files in: $dir_prefix/pub-dir-name/gl/*.gl What is a gl file? -A file with one gene per line, one file per model. -At the top of the file is some pertinent summary information prefixed with \#. -Following this \#-prefixed preamble is the list of genes in the model, one gene per line. -If there is more than one organism\'s genes represented in the model, then the organisms genes will be separated by | e.g. line1:#sif= ... line2:#Saccharomyces cerevisiae|Drosophila melanogaster line3:#... line4:#... line5:GCN4|CG12345 line6:GAL4|CG67890 ... where the 'line:' is not actually included in the real file, but is included here to more clearly illustrate the format. USG die $usage if(@ARGV < 3); use DBI; my $db = shift @ARGV; my $genes_beneath_terms_FILE = shift @ARGV; map { $pub_dir_names->{$_}++; } @ARGV; my $server = 'localhost'; my $username = 'mdaly'; my $password = 'mdalysql'; my $dbh = DBI->connect("dbi:mysql:$db", $username, $password); my $get_species_id = qq{ SELECT species.id FROM species WHERE species.genus=? AND species.species=? }; my $get_species_id_STH = $dbh->prepare($get_species_id); my $get_terms_of_gene_symbol = qq{ SELECT DISTINCT gene_product.id as gene_product_id, gene_product.symbol as symbol, term.acc as term_acc, term.name as term_name, term.term_type as term_type, species.genus as genus, species.species as species FROM gene_product, association, graph_path, term, species WHERE gene_product.species_id = species.id AND association.gene_product_id = gene_product.id AND graph_path.term2_id = association.term_id AND graph_path.term1_id = term.id AND species.genus = ? AND species.species = ? AND gene_product.symbol = ? }; my $get_terms_of_gene_symbol_STH = $dbh->prepare($get_terms_of_gene_symbol); my $get_terms_of_gene_synonym = qq{ SELECT DISTINCT gene_product.id as gene_product_id, gene_product.symbol as symbol, gene_product_synonym.product_synonym as synonym, term.acc as term_acc, term.name as term_name, term.term_type as term_type, species.genus as genus, species.species as species FROM gene_product, gene_product_synonym, association, graph_path, term, species WHERE gene_product_synonym.gene_product_id = gene_product.id AND gene_product.species_id = species.id AND association.gene_product_id = gene_product.id AND graph_path.term2_id = association.term_id AND graph_path.term1_id = term.id AND species.genus = ? AND species.species = ? AND gene_product_synonym.product_synonym = ? }; my $get_terms_of_gene_synonym_STH = $dbh->prepare($get_terms_of_gene_synonym); my $get_ancestors_of_term = qq{ SELECT p.name as term_name, p.acc as term_acc, p.term_type as term_type FROM graph_path as g INNER JOIN term AS t ON (t.id = g.term2_id) INNER JOIN term AS p ON (p.id = g.term1_id) INNER JOIN term2term AS r ON (r.term2_id = p.id) WHERE g.distance > 0 AND t.acc = ? }; my $get_ancestors_of_term_STH = $dbh->prepare($get_ancestors_of_term); my $get_tacc_from_tname = qq{ SELECT acc FROM term WHERE name = ? }; my $get_tacc_from_tname_STH = $dbh->prepare($get_tacc_from_tname); my @root_term_names = qw( biological_process cellular_component molecular_function ); foreach my $rtn (@root_term_names){ $get_tacc_from_tname_STH->bind_param(1,$rtn,{TYPE=>12}); $get_tacc_from_tname_STH->execute(); my $tacc; my $row = $get_tacc_from_tname_STH->fetchrow_arrayref(); if(scalar(@$row) > 1){ print STDERR "ERROR: more than term.acc corresponds " . "to term.name = $rtn\n"; exit; } elsif(scalar(@$row) == 0){ print STDERR "ERROR: no term.acc corresponds " . "to term.name = $rtn\n"; exit; } else{ $tacc = $row->[0]; $acc_by_type->{$rtn} = $tacc; print "acc_by_type->{$rtn} : $acc_by_type->{$rtn}\n"; } } @outfile_data_fields = qw( pval n_genes_in_model_with_term n_genes_in_model n_genes_with_term n_genes_in_GO species_id genus species term_acc term_type term_name genes_in_model_with_term ); print "genes_beneath_terms_FILE: $genes_beneath_terms_FILE\n"; my $org_acc_n_genes_beneath = read_genes_beneath_file($genes_beneath_terms_FILE); #my $org_acc_n_genes_beneath = read_genes_beneath_file($genes_beneath_terms_FILE); #foreach my $k1 (keys %{ $org_acc_n_genes_beneath }){ # foreach my $k2 (keys %{ $org_acc_n_genes_beneath->{$k1} }){ # print "org_acc_n_genes_beneath->{$k1}{$k2} : $org_acc_n_genes_beneath->{$k1}{$k2}\n" # if($org_acc_n_genes_beneath->{$k1}{$k2} < 1); # } #} #exit; $cwd = `pwd`; chomp $cwd; print "cwd = $cwd\n"; my $count = 0; foreach my $pub_dir_name (sort keys %{ $pub_dir_names }) { my @dp = split(/\//,$pub_dir_name); my $pub = shift @dp; my $gl_dir = ""; if(scalar(@dp)>=1){ my $path = join "/", @dp; $gl_dir = "$dir_prefix/$pub/gl/$path"; } else{ $gl_dir = "$dir_prefix/$pub/gl"; } chdir "$gl_dir" or die "Cannot cd to $gl_dir: $!\n"; my @gl_files = split "\n", `ls *.gl`; my $pub_dir_str = join ".", $pub,@dp; #my $missing_gene_errorlog = "$cwd/missing_gene_errorlog.$pub.$$"; #if(-e $missing_gene_errorlog){ # open(MISSING, ">> $missing_gene_errorlog") # or die "Cannot open $missing_gene_errorlog: $!\n"; # } # else{ # open(MISSING, "> $missing_gene_errorlog") # or die "Cannot open $missing_gene_errorlog: $!\n"; # } #my @gl_files = (); #push @gl_files, "variant35.gl"; for my $gl_file (@gl_files) { my $gl_file_path = "$gl_dir/$gl_file"; my ($genes) = parse_gl_file($gl_file_path); #genes->{organism}{gene_name} $count++; print STDERR "$count\t$pub_dir_name\t$gl_file\n"; my $results_file = "$cwd/results/$pub_dir_name/${gl_file}.enrichment"; #my $res_file_name = join "_", "$pub_dir_str", "${gl_file}.enrichment"; #my $results_file = "$cwd/results/enrichment-files/$res_file_name"; open(RESULTS, "> $results_file") or die "Cannot open $results_file: $!\n"; my $n_genes_in_model = 0; foreach my $org (sort keys %{ $genes }) { my($genus,$species) = split(/\s/, $org); my $species_id = get_species_id($genus,$species); my ($genes_in_model,#ref-to-hash genes_in_model->{gene_product_id} = symbol $org_tacc_gid, #org_tacc_gid->{org}{tacc}{gene_name}++ $org_tacc #org_tacc->{org}{tacc} = "ttype\ttname" ) = get_terms_annotated_to_genes($genes,$org,$genus,$species, \*RESULTS,$gl_file); $n_genes_in_model = scalar(keys %{ $genes_in_model }); #printf "%s\n", join "\t", @outfile_data_fields; printf RESULTS "%s\n", join "\t", @outfile_data_fields; foreach my $tacc (keys %{ $org_tacc_gid->{$org} }){ my @genes_in_model_with_term = keys %{ $org_tacc_gid->{$org}{$tacc} }; my $n_genes_in_model_with_term = scalar(@genes_in_model_with_term); ###### FILTER OUT IF OVERLAP IS <= 1 RIGHT HERE ####### #if($n_genes_in_model_with_term > 1) { my ($ttype,$tname) = split(/\t/,$org_tacc->{$org}{$tacc}); my $n_genes_in_GO = $org_acc_n_genes_beneath->{$org}{$acc_by_type->{$ttype}}; my $n_genes_with_term = $org_acc_n_genes_beneath->{$org}{$tacc}; my $hyperp_input = join " ", $n_genes_in_model_with_term, $n_genes_in_model, $n_genes_with_term, $n_genes_in_GO; my $pval = `$hyper_p $hyperp_input\n`; chomp $pval; $pval = sprintf("%0.6e", $pval); #printf STDERR "$pval\t$n_genes_in_model_with_term\t" # . "$n_genes_in_model\t$n_genes_with_term\t" # . "$n_genes_in_GO\t$species_id\t$genus\t$species\t" # . "$tacc\t$ttype\t\"$tname\"\t" # . "%s\n", join "\t", sort @genes_in_model_with_term; printf RESULTS "$pval\t$n_genes_in_model_with_term\t" . "$n_genes_in_model\t$n_genes_with_term\t" . "$n_genes_in_GO\t$species_id\t$genus\t$species\t" . "$tacc\t$ttype\t\"$tname\"\t" . "%s\n", join "\t", sort @genes_in_model_with_term; } } close(RESULTS); } #close(MISSING); } sub get_terms_annotated_to_genes { my ($gene_list,$organism,$genus,$species,$ofh,$gl_name) = @_; my $genes_in_model = {}; #genes_in_model->{gene_product_id} = symbol my $org_tacc_gid = {}; #org_tacc_gid->{org}{tacc}{gid}++ my $org_tacc = {}; #org_tacc->{org}{tacc} = "ttype\ttname" foreach my $gene (keys %{ $gene_list->{$organism} }) { my $tmp_terms = {}; ($tmp_terms,$genes_in_model) = get_terms($gene,$genus,$species, $genes_in_model,$ofh,$gl_name); $tmp_terms = get_ancestors($tmp_terms); foreach my $ttype (keys %{ $tmp_terms }){ foreach my $tacc (keys %{ $tmp_terms->{$ttype} }) { $org_tacc_gid->{$organism}{$tacc}{$gene}++; $org_tacc->{$organism}{$tacc} = "$ttype\t$tmp_terms->{$ttype}{$tacc}"; } } } return ($genes_in_model, $org_tacc_gid, $org_tacc); } sub get_terms { my ($gene,$genus,$species,$genes_in_model,$ofh,$gl_name) = @_; #### CLOOGE!!! ADDS _HUMAN ONTO END OF ALL HUMAN GENES!!! #### if(($genus=~/homo/i) && ($species=~/sapiens/i)){ $gene .= "_HUMAN"; } $get_terms_of_gene_symbol_STH->bind_param(1, $genus, {TYPE=>12}); $get_terms_of_gene_symbol_STH->bind_param(2, $species, {TYPE=>12}); $get_terms_of_gene_symbol_STH->bind_param(3, $gene, {TYPE=>12}); $get_terms_of_gene_symbol_STH->execute(); my $tmp_terms = {}; while(my $Ref = $get_terms_of_gene_symbol_STH->fetchrow_hashref()) { next if($Ref->{term_name} eq 'all'); unless($tmp_terms->{ $Ref->{term_type} }{ $Ref->{term_acc} }) { $tmp_terms->{ $Ref->{term_type} }{ $Ref->{term_acc} } = $Ref->{term_name}; } unless(exists $genes_in_model->{ $Ref->{gene_product_id} }) { $genes_in_model->{ $Ref->{gene_product_id} } = $Ref->{symbol}; } } if(scalar(keys %{ $tmp_terms }) == 0){ $get_terms_of_gene_synonym_STH->bind_param(1, $genus, {TYPE=>12}); $get_terms_of_gene_synonym_STH->bind_param(2, $species, {TYPE=>12}); $get_terms_of_gene_synonym_STH->bind_param(3, $gene, {TYPE=>12}); $get_terms_of_gene_synonym_STH->execute(); my $synonyms = {}; while(my $Ref = $get_terms_of_gene_synonym_STH->fetchrow_hashref()) { next if($Ref->{term_name} eq 'all'); unless($tmp_terms->{ $Ref->{term_type} }{ $Ref->{term_acc} }) { $tmp_terms->{ $Ref->{term_type} }{ $Ref->{term_acc} } = $Ref->{term_name}; } unless(exists $genes_in_model->{ $Ref->{gene_product_id} }) { $genes_in_model->{ $Ref->{gene_product_id} } = $Ref->{symbol}; $synonyms->{$gene} = $Ref->{symbol}; } } if(scalar(keys %{ $tmp_terms }) > 0){ print $ofh "#USING $synonyms->{$gene} instead of its synonym " . "$gene in $genus $species\n"; } elsif(scalar(keys %{ $tmp_terms }) == 0){ print $ofh "#NOT FOUND: $gene in $genus $species\n"; #print MISSING "#NOT FOUND: $gl_name: $gene in $genus $species\n"; } } return ($tmp_terms,$genes_in_model); } sub get_ancestors { my ($terms) = @_; foreach my $ttype (keys %{ $terms }){ foreach my $tacc (keys %{ $terms->{$ttype} }) { $get_ancestors_of_term_STH->bind_param(1, $tacc, {TYPE=>12}); $get_ancestors_of_term_STH->execute(); while(my $Ref = $get_ancestors_of_term_STH->fetchrow_hashref) { next if($Ref->{term_name} eq 'all'); unless(exists $terms->{$Ref->{term_type}}{ $Ref->{term_acc} }) { $terms->{$Ref->{term_type}}{ $Ref->{term_acc} } = $Ref->{term_name}; } } } } return $terms; } sub get_species_id { my ($genus,$species) = @_; $get_species_id_STH->bind_param(1, $genus, {TYPE=>12}); $get_species_id_STH->bind_param(2, $species, {TYPE=>12}); $get_species_id_STH->execute(); my $species_id; my $row = $get_species_id_STH->fetchrow_arrayref(); if(scalar(@$row) > 1){ print STDERR "ERROR: more than one species id corresponds " . "to genus=$genus species=$species\n"; exit; } elsif(scalar(@$row) == 0){ print STDERR "ERROR: no species id corresponds " . "to genus=$genus species=$species\n"; exit; } else{ $species_id = $row->[0]; } return $species_id; } sub parse_gl_file { my ($f) = @_; my $genes = {}; my @orgs = (); my $n_org; open(F, "$f") or die "Cannot open $f: $!\n"; while(){ chomp; if(/^\#/){ next if($. != 2); s/^\#//; @orgs=split(/\|/); $n_org = scalar(@orgs); next; } if($n_org > 1){ my @l=split(/\|/); for my $i (0..$#l){ $genes->{$orgs[$i]}{$l[$i]}++; } } elsif($n_org == 1){ $genes->{$orgs[0]}{$_}++; } else{ print STDERR "ERROR in readGeneList(). n_org <= 0\n"; exit; } } close(F); return $genes; } sub read_genes_beneath_file { my($file) = @_; my $org_acc_n_genes_beneath = {}; open(FILE, "< $file") or die "Cannot open $file: $!\n"; while(){ chomp; my ($acc,$n_genes_beneath,$org) = split(/\t/,$_); unless($n_genes_beneath == 0){ $org_acc_n_genes_beneath->{$org}{$acc} = $n_genes_beneath; } } close(FILE); return $org_acc_n_genes_beneath; }