2017-12-29 13 views
-1

안녕하세요.펄 스크립트에서 몇 가지 변경 사항을 만드는 방법

분류학 과제 목록 (qiime 목록)이 있으며 txt 파일의 목록을 사용하여 bacterias 목록을 추출하고 싶습니다. 나는이 코드를 가지고 있는데, 현재 2 가지 문제가있다. 하나는 분류 파일에 # (qiime 목록)으로 시작하는 2 줄이 있고, 첫 번째 (biom 파일에서 생성 된 #)는 내가 피하기 원하는 것이다. 하나 (#OTU의 ID ..... 샘플) 나 각 열의 이름으로 사용하는 한, 나는 코드의 다른 장소에서

if ($_=~ m/Constructed from biom file/){ next; }

같은 것을 시도했다, 그러나 단지 돈 (하지만 파일에서 해당 줄을 제거하면 제대로 작동합니다.)하지만 중요한 점은 파일을 조작하고 싶지 않기 때문에 그 줄을 피하려는 이유입니다.

두 번째 문제점 (잘 만드는 방법을 알지 못하는 코드의 일부)이 스크립트에는 -s 옵션이 있습니다. 즉, 특정 열을 원하면 스크립트는 그 중 하나만 추출합니다. (예 : -s sample1, sample2, sample..n), 문제는 선택된 샘플이 주어진 행 (xRow 0.0 0.0 0.0)의 모든 열에서 cero (0)의 값을 가지며, 이 경우 스크립트에서 $ val [1]을 사용하지 않아야하지만 한 샘플에 대해서만 사용됩니다. 그러나 2 또는 mores 샘플의 경우 모든 행에 현재 cero가 있으면 피해야합니다 (이 경우 3 개의 샘플) .

스크립트 :

#!/usr/bin/env perl 
use strict; 
use warnings; 
use Getopt::Long; 
use Data::Dumper qw(Dumper); 
use List::MoreUtils qw(uniq); 
use List::Util qw(sum); 

    my ($search_label, $infile_taxon, $infile_list, $output_file); 

     GetOptions (
      'i=s' =>\$infile_taxon, 
      'l=s' =>\$infile_list, 
      's=s' =>\$search_label, 
      'o=s' =>\$output_file, 
      ); 

     sub match_genera { 
      my ($List_File, $Taxon_File) = @_; 
      my @lista_genera = @{ $List_File }; # dereferencing and copying each array 
      my @taxon_qiime = @{ $Taxon_File }; 
      my (@extract); 
       foreach (@lista_genera){ 
        my $unit = $_; 
        chomp $unit; 
        my @match = grep (/$unit/, @taxon_qiime); 
          push (@extract, @match); 
       } 
       return @extract; 
     } 



    open INFILE_TAXONOMY, '<', "$infile_taxon" or die $!; 
    open LIST_BACTERIA, '<', "$infile_list" or die $!; 
    open OUTPUT, '>', "xfile2.txt" or die $!; 

    chomp (my @sample_names = split '\t', <INFILE_TAXONOMY>); 
    shift @sample_names; 
    unshift @sample_names, '#Genera'; 

    my (@ToExtract, @no_match, @filter, @filter_columns); 

    # si ingresamos un -s 
    if ($search_label){ 
     my @wanted= split (/\,/, $search_label); 
     unshift @wanted, '#Genera'; 
     @wanted = uniq (@wanted); 

     foreach my $wanted_in (@wanted){ 
      if (my @match = grep (/$wanted_in/, @sample_names)) { 
       push (@ToExtract, @match); 
      } 
      else { 
       push (@no_match, $wanted_in); 
      } 
     } 

     if (grep {defined($_)} @no_match){ 
      print "\nSamples No Found: @no_match\n\n"; 
     } 
     print OUTPUT join "\t", @ToExtract, "\n"; 
    } 

    # si queremos toda la tabla !! 
    else{ 
     @ToExtract = @sample_names; 
     print OUTPUT join "\t", @ToExtract, "\n"; 
    } 

    # Extraer los varoles de cada linea para cada variable !!! 
    #my %row; 
    while (<INFILE_TAXONOMY>){ 
     my %row; 
     @row{@sample_names} = split "\t"; 
     @filter= join "\t", @row{@ToExtract}; 
      push (@filter_columns, @filter); # ahora ya se puede usar con la 
    } 

    my @list; 
    foreach (<LIST_BACTERIA>){ 
     if ($_=~ m/^#|^$/) { next;   } 
     else    { push @list, $_ } 
    } 
    my @filter_list = uniq (@list); 

    my @last = match_genera (\@filter_list, \@filter_columns); 

    my (@genera_taxon, @genera_final); 
    foreach (@last){ 
     @genera_taxon = ($_ =~ m/;D_5__(\w.*)/g); 
        foreach (@genera_taxon){ 
         if ($_ =~ m/^$/g) { next;     } 
         else    { push @genera_final, $_; } 
        } 
    } 

     foreach (@genera_final){ #split the line in a multiples var !!! 
      chomp; 
        my @val = split (/\t/, $_); 
        foreach ($val[1]){ 
         if ($_ == 0){ next; } 
         else{print join "\t", @val,"\n";} 
        } 

       } 

close INFILE_TAXONOMY; 
close LIST_BACTERIA; 
close OUTPUT; 
exit; 

분류 학적 파일 (qiime의 목록) 탭으로 구분 된 텍스트입니다! :

#Constructed from biom file 
#OTU ID sample1 sample2 sample3 
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 10;D_4__ABS-19;D_5__uncultured bacterium 0.002804 0.0073441109 0.0 
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 10;D_4__CA002;D_5__uncultured bacterium 0.0 0.001109 0.0 
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 10;D_4__Sva0725;D_5__uncultured bacterium 0.0 0.00882217 0.0014038202 
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 7;D_4__uncultured bacterium;D_5__ 0.0 0.0 0.00898876404 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 13;Ambiguous_taxa;D_4__;D_5__ 0.0 0.0 0.00140449438202 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 2;D_3__uncultured bacterium;D_4__;D_5__ 0.0 0.0 0.00280898876404 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 21;D_3__uncultured bacterium;D_4__;D_5__ 0.0 0.0 0.00421348314607 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 22;D_3__uncultured bacterium;D_4__;D_5__ 0.0 0.0 0.00421348314607 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 22;D_3__uncultured prokaryote;D_4__;D_5__ 0.0 0.0 0.0014038202 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 25;D_3__uncultured Acidobacteria bacterium;D_4__;D_5__ 0.0012041933 0.0 0.0 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 5;D_3__uncultured bacterium;D_4__;D_5__ 0.00120401933 0.0 0.0 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 6;D_3__uncultured Acidobacteria bacterium;D_4__;D_5__ 0.0 0.00115473441109 0.0 
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 6;D_3__uncultured bacterium;D_4__;D_5__ 0.00180614087899 0.0 0.00280898876404 
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__OM1 clade;D_5__uncultured actinobacterium 0.0 0.0 0.00140449438202 
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__OM1 clade;D_5__uncultured bacterium 0.0 0.0 0.00561797752809 
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__Sva0996 marine group;D_5__uncultured bacterium 0.0 0.0 0.00280898876404 
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__uncultured;D_5__uncultured actinobacterium 0.00301023479831 0.00115473441109 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__uncultured;D_5__uncultured bacterium 0.000602059663 0.001173441109 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Dietziaceae;D_5__Dietzia 0.0150511739916 0.0311778290993 0.00140449438202 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Mycobacteriaceae;D_5__Mycobacterium 0.00240818865 0.002309882217 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Nocardiaceae;D_5__Gordonia 0.0 0.0 0.00140449438202 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Nocardiaceae;D_5__Rhodococcus 0.00240865 0.0013441109 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Nocardiaceae;D_5__Williamsia 0.0 0.0 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Tsukamurellaceae;D_5__Tsukamurella 0.000020463 0.0 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__nbr16a11;D_5__uncultured bacterium 0.0014093 0.001134411 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Kineosporiales;D_4__Kineosporiaceae;D_5__Quadrisphaera 0.0 0.0014734 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Demequinaceae;D_5__Lysinimicrobium 0.00120409391933 0.0 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Intrasporangiaceae;D_5__Ornithinimicrobium 0.0006959663 0.0 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Intrasporangiaceae;D_5__Tetrasphaera 0.0 0.00441109 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Micrococcaceae;D_5__Glutamicibacter 0.0 0.0031408776 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Micrococcaceae;D_5__Pseudarthrobacter 0.0 0.002882217 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micromonosporales;D_4__Micromonosporaceae;D_5__Actinoplanes 0.0 0.0011441109 0.0 
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Propionibacteriales;D_4__Propionibacteriaceae;D_5__Propionibacterium 0.3479831 0.0882217 0.00280898876404 

목록 :

#list 
Gordonia 
Mycobacterium 
Ornithinimicrobium 
Marinobacter 
Pseudoalteromonas 
Pseudomonas 
Halomonas 
Alcanivorax 
Acinetobacter 
Shewanella 
Pseudidiomarina 
Microbulbifer 
Bacillus 
Microbacterium 
Achrornobacter 
Actinomyces 
Alcaligenes 

밖으로 인쇄 예상 (탭 한정된) :

#genera sample1  sample3 
Gordonia 0.00301023479831 0.00140449438202  
Mycobacterium 0.00240818783865 0.0 
Ornithinimicrobium 0.000602046959663 0.0 
Pseudomonas 0.367850692354 0.254213483146 
Halomonas 0.000602046959663 0.00140449438202  
Acinetobacter 0.00301023479831 0.00561797752809  
Bacillus 0.0626128838049 0.00280898876404  
Klebsiella 0.0138470800722 0.00280898876404  
Lactobacillus 0.000602046959663 0.0 
Acinetobacter 0.00301023479831 0.00561797752809  
Gordonia 0.00301023479831 0.00140449438202  
Rhodococcus 0.00240818783865 0.0 
Williamsia 0.000602046959663 0.0 
Streptomyces 0.000602046959663 0.0 
Dietzia 0.0150511739916 0.00140449438202  
Aquabacterium 0.000602046959663 0.0 
Janthinobacterium 0.0180614087899 0.0294943820225 
Massilia 0.000602046959663 0.00140449438202  
Noviherbaspirillum 0.000602046959663 0.0 
Rhodococcus 0.00240818783865 0.0 
Staphylococcus 0.166164960867 0.0688202247191 
Haemophilus 0.00120409391933 0.00280898876404  
Stenotrophomonas 0.000602046959663 0.00140449438202  
Candidatus Endomicrobium 0.00662251655629 0.0 
Candidatus Hepatincola 0.000602046959663 0.0 

마지막으로 코드를 더 간단하게 만들려고 노력했습니다. 새 스크립트에서 코드의 일부를 편집하려고 시도했지만 < INFILE_TAXONOMY 대신 @newarray를 사용하여 열을 선택했습니다. 새로운 배열에서리스트는 거의 완성되어 원하는 출력을 원하는대로 원하는 열을 선택하기 만하면됩니다. 배열의 다음 코드를 만드는 방법.

while (<INFILE_TAXONOMY>){ 
     my %row; 
     @row{@sample_names} = split "\t"; 
     @filter= join "\t", @row{@wanted}; 
      push (@filter_columns, @filter); 
    } 

특정 열을 선택하는 옵션이없는 새 스크립트 !!!

use strict; 
use warnings; 
use List::MoreUtils qw(uniq); 
use Data::Dumper qw(Dumper); 


#---------------------------Subrutina de extraccion para 2 arrays de lista de bacterias y taxones!!------------------------------------------------------------- 
     sub match_genera { 
      my ($List_File, $Taxon_File) = @_; 
      my @taxon_qiime = @{ $Taxon_File }; 
      my @lista_genera = @{ $List_File }; # dereferencing and copying each array 
      my (@match, @extract, @genera_clean); 

        foreach (@taxon_qiime){ 
        my @generas_taxon = ($_ =~ m/;D_5__(\w.*)/g); 
        foreach (@generas_taxon){ 
         if ($_ =~ m/^$/g) { next;     } 
         else    { push @genera_clean, $_; } 
        } 
        } 

        foreach (@lista_genera){ 
        my $list_unit = $_; 
        chomp $list_unit; 
        @match = grep (/$list_unit/, @genera_clean); 
          push (@extract, @match); 
        }   
       return @extract; 
     } 
#------------------------------------------------------FILES----------------------------------------------------------------------- 



      open INFILE_TAXONOMY, '<', "otu_table_L6_copy.txt" or die $!; 
      open LIST_BACTERIA, '<', "lista_degradadoras.txt" or die $!; 

      my (@lista_bacteria, @taxon, @sample_names); 


# -------------------------------------------------------------TAXON ------------------------------------------------------------- 
       foreach (<INFILE_TAXONOMY>){ 
        chomp; 
        if ($_=~ m/^$|Constructed from biom file/g) { next;      } 
        elsif ($_ =~ s/OTU ID/Genera/g)    { push @sample_names, $_; } 
        else           { push (@taxon, $_);   } 
       } 


# -------------------------------------------------------------LIST ------------------------------------------------------------- 
       foreach (<LIST_BACTERIA>){ 
        if ($_=~ m/^$|^#/g)  { next;     } 
        else     { push @lista_bacteria, $_  } 

       } 
      my @filter_list = uniq (@lista_bacteria); 

# ------------------------------------------------------------------------------------------------------------------------------- 


      my @match_all = match_genera (\@filter_list, \@taxon); 

      unshift @match_all, @sample_names; 

      @wanted= qw(sample1 sample3); 

      foreach (@match_all){ 
       print "$_\n"; 
       } 

      close INFILE_TAXONOMY; 
      close LIST_BACTERIA; 
      exit; 

답변

0

교체

<INFILE_TAXONOMY>; # Skip first line. 
chomp (my @sample_names = split '\t', <INFILE_TAXONOMY>); 

또는

chomp (my @sample_names = split '\t', <INFILE_TAXONOMY>); 

my $sample_names; 
while ($sample_names = <INFILE_TAXONOMY>) { 
    chomp($sample_names); 
    last if $sample_names !~ /Constructed from biom file/; 
} 

defined($sample_names) 
    or die("Premature EOF"); 

my @sample_names = split /\t/, $sample_names; 
+0

덕분에 너무, 두 번째 옵션은 잘 작동합니다, 감사합니다! – abraham

관련 문제