Changeset 533 for trunk

Show
Ignore:
Timestamp:
07/03/11 21:15:52 (11 months ago)
Author:
AlexanderPico
Message:

major overhaul to how array IDs are collected; other minor improvements, including GO terms as attributes

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/dbbuilder/src/org/bridgedb/extract/Ensembl_ETL_Device_v18_local.pl

    r518 r533  
    4040use DBI; 
    4141use HashSpeciesList; 
     42use HashArrayList; 
    4243use lib '/home/socr/c/users2/apico/src/ensembl/modules'; 
    4344#use lib '/home/socr/c/users2/apico/src/ensembl-compara/modules'; 
     
    184185 
    185186} 
    186  
    187 close(SPECIES); 
    188187 
    189188## MENU OF SUPPORTED SPECIES 
     
    427426my $slice_adaptor = $registry->get_adaptor($species, "core", "slice"); 
    428427my $go_adaptor = $registry->get_adaptor("Multi", "Ontology", "GOTerm"); 
    429 my $probe_feature_adaptor = $registry->get_adaptor($species, "funcgen", "ProbeFeature"); 
    430 my $probe_adaptor = $registry->get_adaptor($species, "funcgen", "Probe"); 
    431 my $probe_set_adaptor = $registry->get_adaptor($species, "funcgen", "Probeset"); 
     428#my $probe_feature_adaptor = $registry->get_adaptor($species, "funcgen", "ProbeFeature"); 
     429#my $probe_adaptor = $registry->get_adaptor($species, "funcgen", "Probe"); 
     430#my $probe_set_adaptor = $registry->get_adaptor($species, "funcgen", "Probeset"); 
     431my $array_adaptor = $registry->get_adaptor($species, "funcgen", "Array");   
    432432my @dbas = @{Bio::EnsEMBL::Registry->get_all_DBAdaptors(-species => $species)}; 
    433433my $dbname = $dbas[0]->dbc->dbname();        # e.g., core_mus_musculus_42_36c 
     
    11901190                    \%Attributes);   
    11911191     
    1192     ## EXTRACT FUNCGEN INFORMATION 
    1193     if ($funcgen =~ /(Y|Yes)/i){ 
    1194         parse_ProbeFeatures($gene,  
    1195                         \%GeneTables, 
    1196                         \%Ensembl_GeneTables, 
    1197                         \%Attributes); 
    1198     } 
    1199  
    12001192    ## EXTRACT GENE STRUCTURE INFORMATINO 
    12011193#    parse_AllTranscripts($gene->get_all_Transcripts(),  
     
    12151207    if ($count % $purge_frequency == 0 || $count == ($collect_sample + $start_count - 1)){ 
    12161208        print "\n"; 
     1209 
     1210            ## AT THE END OF SAMPLE COLLECTION OR TOTAL GENOME COLLECTION 
     1211            ## BUT BEFORE THE LAST DATA LOAD INTO MYSQL 
     1212            if ($count == ($collect_sample + $start_count - 1)){ 
     1213                ## COLLECT MIRCOARRAY TABLES 
     1214                if ($funcgen !~ /(N|No)/i ){ 
     1215                    parse_ProbeFeatures( 
     1216                        \%GeneTables, 
     1217                        \%Ensembl_GeneTables 
     1218                        ); 
     1219                } 
     1220            } 
    12171221 
    12181222        ## LOAD DATA INTO MYSQL TABLES? 
     
    14741478} # END: loop through each gene in genome 
    14751479 
    1476  
    14771480print "\nDONE\n"; 
    14781481 
     
    17721775                $$GeneTables{GeneOntology}{$count.$dot.$subcount{GeneOntology}} = [$dbe_primary_id, $name, $namespace]; 
    17731776                $$Ensembl_GeneTables{GeneOntology}{$count.$dot.$subcount{GeneOntology}} = [$gene_stable_id, $dbe_primary_id]; 
     1777                $$Attributes{GeneOntology}{$count.$dot.$subcount{GeneOntology}.$dot.'1'} = [$dbe_primary_id, mysql_quotes( $$GeneTables{GeneOntology}{'NAME'}[1]), mysql_quotes('Term'), $name];  
    17741778                ++$subcount{GeneOntology}; 
    17751779            } 
     
    17831787                  my $term = $go_adaptor->fetch_by_accession($acc); 
    17841788                  my $name = mysql_quotes($term->name()); #e.g., plasma membrane 
    1785                   my $namespace = mysql_quotes($term->namespace()); # e.g., cellular component 
     1789                  my $namespace = mysql_quotes($term->namespace()); # e.g., cellular_component 
    17861790                  if ($namespace =~ /\'biological_process\'/){ 
    17871791                        $$GeneTables{GOslimBP}{$count.$dot.$subcount{GOslimBP}} = [$dbe_primary_id, $name]; 
    17881792                        $$Ensembl_GeneTables{GOslimBP}{$count.$dot.$subcount{GOslimBP}} = [$gene_stable_id, $dbe_primary_id]; 
     1793                        $$Attributes{GOslimBP}{$count.$dot.$subcount{GOslimBP}.$dot.'1'} = [$dbe_primary_id, mysql_quotes( $$GeneTables{GOslimBP}{'NAME'}[1]), mysql_quotes('Term'), $name];     
    17891794                        ++$subcount{GOslimBP}; 
    17901795                  } elsif ($namespace =~ /\'cellular_component\'/){ 
    17911796                        $$GeneTables{GOslimCC}{$count.$dot.$subcount{GOslimCC}} = [$dbe_primary_id, $name]; 
    17921797                        $$Ensembl_GeneTables{GOslimCC}{$count.$dot.$subcount{GOslimCC}} = [$gene_stable_id, $dbe_primary_id]; 
     1798                        $$Attributes{GOslimCC}{$count.$dot.$subcount{GOslimCC}.$dot.'1'} = [$dbe_primary_id, mysql_quotes( $$GeneTables{GOslimCC}{'NAME'}[1]), mysql_quotes('Term'), $name]; 
    17931799                        ++$subcount{GOslimCC}; 
    17941800                  } elsif ($namespace =~ /\'molecular_function\'/){ 
    17951801                        $$GeneTables{GOslimMF}{$count.$dot.$subcount{GOslimMF}} = [$dbe_primary_id, $name]; 
    17961802                        $$Ensembl_GeneTables{GOslimMF}{$count.$dot.$subcount{GOslimMF}} = [$gene_stable_id, $dbe_primary_id]; 
     1803                        $$Attributes{GOslimMF}{$count.$dot.$subcount{GOslimMF}.$dot.'1'} = [$dbe_primary_id, mysql_quotes( $$GeneTables{GOslimMF}{'NAME'}[1]), mysql_quotes('Term'), $name]; 
    17971804                        ++$subcount{GOslimMF}; 
    17981805                  } else { 
    17991806                        #garbage? 
     1807                        print "Unrecognized GO-slim namespace in $genus_species: $namespace\n"; 
    18001808                  } 
    18011809                } else { 
     
    21662174################################################################################################# 
    21672175sub parse_ProbeFeatures { 
    2168   my ($gene, $GeneTables, $Ensembl_GeneTables, $Attributes) = @_; 
     2176  #my ($gene, $GeneTables, $Ensembl_GeneTables, $Attributes) = @_; 
     2177  my ($GeneTables, $Ensembl_GeneTables) = @_; 
    21692178  my %subcount = (); 
    21702179  my %seen = (); 
     2180  my %arrayTable = getArrayTable(); 
    21712181 
    21722182  foreach my $key ( keys %$GeneTables) { 
    21732183      $subcount{$key} = 1; 
    2174       %{$seen{$key}} = (); 
    21752184  } 
    21762185 
    2177   ## Solution #1 only works after the efg database has been patched to key on gene ids 
    2178   ## See NathJohnson_DB_patch.sql 
    2179     #my $probe_features = $probe_adaptor->fetch_all_by_external_name($gene_stable_id); 
    2180   ## Solution #2 only works after API method has been added 
    2181   ## See NathJohnson_API_patch.txt 
    2182     #my @probe_features = @{$probe_feature_adaptor->fetch_all_by_linked_transcript_Gene($gene)}; 
    2183     my @probes = @{$probe_adaptor->fetch_all_by_linked_transcript_Gene($gene)}; 
    2184     my @probe_sets = @{$probe_set_adaptor->fetch_all_by_linked_transcript_Gene($gene)}; 
    2185     my @all_probes = (@probes, @probe_sets); 
    2186  
    2187     foreach my $probe (@all_probes) { 
    2188       #my $probe = $pf->probe(); 
    2189       my $array_list = $probe->get_all_Arrays(); 
    2190  
    2191       foreach my $array (@$array_list){ 
    2192         if (!$array) { 
     2186my @arrayList = (@{$arrayTable{$genus_species}});   
     2187foreach my $array_name (@arrayList){ 
     2188 
     2189print "fetching array: $array_name...\n"; 
     2190    my $array = $array_adaptor->fetch_by_name_vendor($array_name); 
     2191    my $p_dbname = mysql_quotes($array->vendor()); 
     2192 
     2193print "getting all probes...\n"; 
     2194my @plist = (); 
     2195  if ($p_dbname =~ /^\'AFFY/i) {   
     2196    @plist = @{$array->get_all_ProbeSets()}; 
     2197  } else { 
     2198    @plist = @{$array->get_all_Probes()};   
     2199  } 
     2200 
     2201my $pcount = $#plist; 
     2202print "probe count= $pcount\n"; 
     2203 
     2204    %{$seen{dbRel}} = (); 
     2205    foreach my $p (@plist) { 
     2206        my $p_display_id = 'null'; 
     2207        if ($p_dbname =~ /^\'AFFY/i) { # Affy uses probesets 
     2208                $p_display_id = $p->name();  
     2209        } else { 
     2210                $p_display_id = $p->get_probename($array_name); 
     2211        } 
     2212 
     2213        ## Handle case when list of of probe names is returned 
     2214        my @pnamelist = split(',', $p_display_id); 
     2215        foreach my $p_name (@pnamelist){         
     2216 
     2217        my $p_primary_id = mysql_quotes($p_name); 
     2218        my $p_release = mysql_quotes($array->name()); 
     2219 
     2220        %{$seen{Gene}} = (); 
     2221        my @dbeList = @{$p->get_all_DBEntries()}; 
     2222        foreach my $dbe (@dbeList){ 
     2223          my $dbe_dbname = $dbe->dbname(); 
     2224          my $gene = ""; 
     2225          if ($dbe_dbname =~ /core_Transcript$/){ 
     2226                $gene = $gene_adaptor->fetch_by_transcript_stable_id($dbe->primary_id());        
     2227          } else { 
    21932228                next; 
    2194         } 
    2195         my $pf_dbname = mysql_quotes($array->vendor()); 
    2196         my $pf_display_id = 'null'; 
    2197         if ($pf_dbname =~ /^\'AFFY/i) { # Affy uses probesets 
    2198                 $pf_display_id = mysql_quotes($probe->name()); ##probeset()->name()); 
    2199         } else { 
    2200                 $pf_display_id = mysql_quotes($probe->get_probename($array->name())); 
    2201         } 
    2202                  
    2203         my $pf_primary_id = $pf_display_id; 
    2204         my $pf_description = mysql_quotes($array->description()); 
    2205         my $pf_release = mysql_quotes($array->name()); 
    2206         my $pf_status = mysql_quotes("XREF"); #enum 
    2207         my $pf_version = mysql_quotes($array->format()); 
    2208         my $pf_info_text = mysql_quotes($array->type()); 
    2209         my $pf_info_type = mysql_quotes("SEQUENCE_MATCH"); #enum 
    2210         #my $pf_synonyms = $probe->get_all_probenames(); 
    2211         my $pf_syns = ''; ##join("|", @$pf_synonyms); 
    2212         $pf_syns = mysql_quotes($pf_syns); 
    2213  
    2214         #print "PROBE: $pf_primary_id | $pf_dbname | $pf_release \n"; 
    2215  
    2216         $ADMIN_Xrefs{$pf_dbname} = [$pf_dbname, $pf_display_id, $pf_primary_id, $pf_description, $pf_syns, 
    2217                              $pf_release, $pf_status, $pf_version, $pf_info_text, $pf_info_type]; 
    2218  
    2219         if ($pf_dbname =~ /^\'AFFY/i && $pf_release !~ /Ex-/){  #catch all types #skip all exon arrays 
    2220             $ADMIN_Xrefs{$pf_dbname}[10] = "\'Y\'"; # collected 
    2221             if (!${$seen{Affy}{$pf_primary_id}}++){ 
    2222                 $$GeneTables{Affy}{$count.$dot.$subcount{Affy}} = [$pf_primary_id, $pf_dbname, $pf_release]; 
    2223                 $$Ensembl_GeneTables{Affy}{$count.$dot.$subcount{Affy}} = [$gene_stable_id, $pf_primary_id]; 
     2229          }  
     2230          $gene_stable_id = mysql_quotes($gene->stable_id()); 
     2231          if (${$seen{Gene}{$gene_stable_id}}++){  
     2232                next; 
     2233          }  
     2234 
     2235        ## Collect info on first occurence   
     2236        my $db_release = $p_dbname."-".$p_release; 
     2237        if (!${$seen{DbRel}{$db_release}}++){ 
     2238          my $p_description = mysql_quotes($array->description()); 
     2239          my $p_status = mysql_quotes("XREF"); #enum 
     2240          my $p_version = mysql_quotes($array->format()); 
     2241          my $p_info_text = mysql_quotes($array->type()); 
     2242          my $p_info_type = mysql_quotes("SEQUENCE_MATCH"); #enum 
     2243          my $p_syns = mysql_quotes(""); 
     2244 
     2245          $ADMIN_Xrefs{$db_release} = [$p_dbname, $p_primary_id, $p_primary_id, $p_description, $p_syns, 
     2246                             $p_release, $p_status, $p_version, $p_info_text, $p_info_type]; 
     2247        } 
     2248 
     2249        if ($p_dbname =~ /^\'AFFY/i #catch all types 
     2250          && $p_release !~ /Ex-/ && $p_release !~ /U133/){  #skip all exon array and old arrays 
     2251            $ADMIN_Xrefs{$db_release}[10] = "\'Y\'"; # collected 
     2252                $$GeneTables{Affy}{$count.$dot.$subcount{Affy}} = [$p_primary_id, $p_dbname, $p_release]; 
     2253                $$Ensembl_GeneTables{Affy}{$count.$dot.$subcount{Affy}} = [$gene_stable_id, $p_primary_id]; 
    22242254                ++$subcount{Affy}; 
    2225             } 
    22262255        } 
    2227         elsif ($pf_dbname =~ /^\'Agilent/i){  #catch all types 
    2228             $ADMIN_Xrefs{$pf_dbname}[10] = "\'Y\'"; # collected 
    2229             if (!${$seen{Agilent}{$pf_primary_id}}++){ 
    2230                 $$GeneTables{Agilent}{$count.$dot.$subcount{Agilent}} = [$pf_primary_id, $pf_dbname, $pf_release]; 
    2231                 $$Ensembl_GeneTables{Agilent}{$count.$dot.$subcount{Agilent}} = [$gene_stable_id, $pf_primary_id]; 
     2256        elsif ($p_dbname =~ /^\'Agilent/i){  #catch all types 
     2257            $ADMIN_Xrefs{$db_release}[10] = "\'Y\'"; # collected 
     2258                $$GeneTables{Agilent}{$count.$dot.$subcount{Agilent}} = [$p_primary_id, $p_dbname, $p_release]; 
     2259                $$Ensembl_GeneTables{Agilent}{$count.$dot.$subcount{Agilent}} = [$gene_stable_id, $p_primary_id]; 
    22322260                ++$subcount{Agilent}; 
    2233             } 
    22342261        } 
    2235         elsif ($pf_dbname =~ /^\'Illumina/i){ #catch all types   
    2236             $ADMIN_Xrefs{$pf_dbname}[10] = "\'Y\'"; # collected 
    2237             if (!${$seen{Illumina}{$pf_primary_id}}++){ 
    2238                 $$GeneTables{Illumina}{$count.$dot.$subcount{Illumina}} = [$pf_primary_id, $pf_dbname, $pf_release]; 
    2239                 $$Ensembl_GeneTables{Illumina}{$count.$dot.$subcount{Illumina}} = [$gene_stable_id, $pf_primary_id]; 
     2262        elsif ($p_dbname =~ /^\'Illumina/i){ #catch all types   
     2263            $ADMIN_Xrefs{$db_release}[10] = "\'Y\'"; # collected 
     2264                $$GeneTables{Illumina}{$count.$dot.$subcount{Illumina}} = [$p_primary_id, $p_dbname, $p_release]; 
     2265                $$Ensembl_GeneTables{Illumina}{$count.$dot.$subcount{Illumina}} = [$gene_stable_id, $p_primary_id]; 
    22402266                ++$subcount{Illumina}; 
    2241             } 
    22422267        } 
    22432268        else { 
    2244             $ADMIN_Xrefs{$pf_dbname}[10] = "\'N\'"; # not collected! 
     2269            $ADMIN_Xrefs{$db_release}[10] = "\'N\'"; # not collected! 
    22452270        } 
    2246  
     2271        } 
    22472272      } 
     2273        ++$count; 
    22482274    } 
     2275  } 
    22492276} 
    22502277