package PackSNP; use strict; use vars qw($VERSION); $VERSION = '1.5'; =head1 NAME PackSNP - Perl module for packing and unpacking SNP genotyping data =head1 SYNOPSIS use PackSNP; ######packing and unpacking div_allele_blob data############# ##unpacking div_allele value $datablob = PackSNP->new_header ( -packing_version => "1", -blob_type => "1", -number_of_sites=>5, -genome_version=>"3a", -chr => "chr8", -start_pos=>1, -end_pos => 1000000, -accession => "B73", -blob_class => 1); $seqstr = "ACGTW"; # marker position blob $position_blob = PackSNP->new_header ( -packing_version => "1", -blob_type => "2", -number_of_sites=>5, -genome_version=>"3a", -chr => "chr8", -start_pos=>1, -end_pos => 1000000, -blob_class => 2); $seqstr = "1622 37313 38992 48573 56995"; # marker id blob data_element_length refer to bit-length of string. = 8 * digits $position_blob = PackSNP->new_header ( -packing_version => "1", -blob_type => "5", -number_of_sites=>5, -genome_version=>"3a", -chr => "chr8", -start_pos=>1, -end_pos => 1000000, -data_element_length => 32, -blob_class => 5); $seqstr = "ID01 ID02 ID03 ID04 ID05"; # association pvalue blob $position_blob = PackSNP->new_header ( -packing_version => "1", -blob_type => "8", -number_of_sites=>5, -genome_version=>"3a", -chr => "chr8", -start_pos=>1, -end_pos => 1000000, -data_element_length => 32, -blob_class => 8, -class_specific_fields => "trait name\tgermplasm group name"); $seqstr = "0.12 0.13 0.15 0.15 0.17"; #pack div_allele value ##return a reference to a binary scalar value with both header and packed data $packed_seq_ref = $datablob->pack_data(\$seqstr); ##unpacking div_allele value #the unpack function takes in a reference to a binary scalar value, and return an array with two elements: a header object and a reference to a string value ($unpacked_blob_obj, $unpacked_seq_str_ref) = PackSNP->unpack_blob($packed_seq_ref); ##print unpqcked value print "unpacked div_allele value blob\n"; print $$unpacked_seq_str_ref, "\n"; print $unpacked_blob_obj->packing_version(), "\n"; print $unpacked_blob_obj->blob_type(), "\n"; print $unpacked_blob_obj->number_of_sites(), "\n"; print $unpacked_blob_obj->genome_version(), "\n"; print $unpacked_blob_obj->chr(), "\n"; print $unpacked_blob_obj->start_pos(), "\n"; print $unpacked_blob_obj->end_pos(), "\n"; print $unpacked_blob_obj->accession(), "\n"; print "\n"; ######packing and unpacking div_allele__assay blob data############# $pos_str = "12 233 344 444 555"; ##pack position header $datablob->blob_type(2); $datablob->accession(""); $packed_pos_ref = $datablob->pack_data(\$pos_str); ##unpacking div_allele positions ($unpacked_blob_obj, $unpacked_pos_str) = PackSNP->unpack_blob($packed_pos_ref); print "unpacked div_allele value blob\n"; print $$unpacked_pos_str, "\n"; print $unpacked_blob_obj->packing_version(), "\n"; print $unpacked_blob_obj->blob_type(), "\n"; print $unpacked_blob_obj->number_of_sites(), "\n"; print $unpacked_blob_obj->genome_version(), "\n"; print $unpacked_blob_obj->chr(), "\n"; print $unpacked_blob_obj->start_pos(), "\n"; print $unpacked_blob_obj->end_pos(), "\n"; print "\n"; =head1 DESCRIPTION This module can be used for packing and unpacking the SNP data. =head2 Methods =over 4 =item * $object= PackSNP->new_header ( -packing_version => , -blob_type => , -number_of_sites=> , -genome_version=> , -chr => , -start_pos=> , -end_pos => , -accession => ); Constructor for creating new header object =item * $object->pack_data($data_ref) This function takes in a reference to the data string. Returns a reference to the binary value of packed data. =item * unpack_blob($blob_ref) Returns an array of two elements: reference to a header object and a reference to the value string. =item * $object->packing_version($blob_ref) get/set packing version =item * $object->load_packed_blob_from_file($file) Loads packed binary blob from a file, returns unpacked binary blob. =item * $object->save_packed_blob_to_file($file) Saves packed binary blob from a file. =item * $object->seek_pos($blob_ref, $pos1, $pos2) Finds a range of indexes corresponding to range of positions. Return 4-elemnt array ($i1, $v1, $i2, $v2) where $iN are indexes and $vN are corresponding values. If there are no positions in the requested range returns (-1, $v1, -1, $v2), where $v1 and $v2 correspond to a closest exiting position range. =item * $object->get_range($blob_ref, $pos1, $pos2) Returns a space seprataed list of positions in the range defined by positions $pos1 and $pos2 or empty string when no positions exist in the requested range. =item * $object->get_data($blob_ref, $ind1, $ind2) Returns a string of SNPs in the range defined by indexes $ind1 and $ind2 or empty string when no positions exist in the requested range. =back =head1 AUTHOR Qi Sun (qisun@cornell.edu) Jaroslaw Pillardy (jarekp@cac.cornell.edu) Genevieve DeClerck (gad14@cornell.edu) =head1 COPYRIGHT Copyright 2009-10 Cornell University. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO perl(1). =cut our %iu2hex = qw( A 0 C 1 G 2 T 3 R 4 Y 5 S 6 W 7 K 8 M 9 B A D B H C V D N E a 0 c 1 g 2 t 3 r 4 y 5 s 6 w 7 k 8 m 9 b A d B h C v D n E - F ); our %hex2iu = qw( 0 A 1 C 2 G 3 T 4 R 5 Y 6 S 7 W 8 K 9 M A B B D C H D V E N F - ); sub new_data_blob{ } sub new_header { my($caller,@args) = @_; # we know our inherietance heirarchy my $self = {@args}; bless $self,$caller; return $self; #padding } sub packing_version { my $self = shift; my $packing_version = shift; if ($packing_version) { $self->{-packing_version} = $packing_version; } return $self->{-packing_version} } sub blob_type { my $self = shift; my $newtype = shift; if ($newtype) { $self->{-blob_type} = $newtype; } return $self->{-blob_type}; } sub number_of_sites { my $self = shift; my $number_of_sites = shift; if ($number_of_sites) { $self->{-number_of_sites} = $number_of_sites; } return $self->{-number_of_sites}; } sub genome_version { my $self = shift; my $genome_version = shift; if ($genome_version) { $self->{-genome_version} = $genome_version; } return $self->{-genome_version}; } sub chr { my $self = shift; my $chr = shift; if ($chr) { $self->{-chr} = $chr; } return $self->{-chr}; } sub start_pos { my $self = shift; my $start_pos = shift; if ($start_pos) { $self->{-start_pos} = $start_pos; } return $self->{-start_pos}; } sub end_pos { my $self = shift; my $end_pos = shift; if ($end_pos) { $self->{-end_pos} = $end_pos; } return $self->{-end_pos}; } sub accession { my $self = shift; my $accession = shift; if ($accession) { $self->{-accession} = $accession; } return $self->{-accession}; } sub data_element_length { my $self = shift; my $data_element_length = shift; if ($data_element_length) { $self->{-data_element_length} = $data_element_length; } return $self->{-data_element_length}; } sub blob_class { my $self = shift; my $blob_class = shift; if ($blob_class) { $self->{-blob_class} = $blob_class; } return $self->{-blob_class}; } sub class_specific_fields { my $self = shift; my $class_specific_fields = shift; if ($class_specific_fields) { $self->{-class_specific_fields} = $class_specific_fields; } return $self->{-class_specific_fields}; } sub unpack_blob { my $s = shift; my $blob_ref = shift; my ($pack_version, $blob_type, $number_of_sites) = unpack "a3aN", $$blob_ref; my ($header_obj, $converted_value); #version 1 div_allele_value blob if ($pack_version =~ /1/) { my ($t1, $t2, $t3, $genome_version, $chr, $start_pos, $end_pos, $accession, $data_element_length, $blob_class, $class_specific_fields) = unpack "a3aNa10a25NNa150Nna150", $$blob_ref; if ($blob_type eq "1") { unless ($blob_class) { $blob_class = 1; } unless ($data_element_length) { $data_element_length = 4; } unless ($class_specific_fields) { $class_specific_fields = ""; } my ($t4, $value) = unpack "a1024H*", $$blob_ref; for (my $i=0; $i<$number_of_sites; $i++) { #$converted_value .= $hex2iu{substr($value, $i, 1)}; $converted_value .= $hex2iu{uc(substr($value, $i, 1))}; } } elsif ($blob_type eq "2") { unless ($blob_class) { $blob_class = 2; } unless ($data_element_length) { $data_element_length = 32; } unless ($class_specific_fields) { $class_specific_fields = ""; } my ($t4, @positions) = unpack "a1024N*", $$blob_ref; $converted_value = join " ", @positions; } elsif ($blob_type eq "5") { unless ($blob_class) { $blob_class = 5; } unless ($class_specific_fields) { $class_specific_fields = ""; } if (($data_element_length % 8)>0) { print STDERR "ERROR: the data element length should be in bits. (8 * charactors)"; exit; } my $marker_id_length = int ($data_element_length / 8); my ($t4, @ids) = unpack "a1024". ("a$marker_id_length" x $number_of_sites), $$blob_ref; $converted_value = join " ", @ids; } elsif ($blob_type eq "8") { if ($] < 5.0092) { my $t = $]; $t =~s/\.0+/./; print STDERR "ERROR: You are using PERL $t. PERL 5.92 is required for packing/unpacking floating values."; exit; } unless ($data_element_length) { $data_element_length = 32; } unless ($class_specific_fields) { $class_specific_fields = ""; } my @ids; my ($t4, @values) = unpack "a1024(f*)>", $$blob_ref; $converted_value = join " ", @values; } $header_obj = PackSNP->new_header ( -packing_version => $pack_version, -blob_type => $blob_type, -number_of_sites=>$number_of_sites, -genome_version=>$genome_version, -chr => $chr, -start_pos=>$start_pos, -end_pos => $end_pos, -accession => $accession, -blob_class => $blob_class, -data_element_length => $data_element_length, -class_specific_fields => $class_specific_fields); } return ($header_obj, \$converted_value); } sub header { my $self = shift; my $packingvalue=""; my $pack_version = $self->{-packing_version}; if ($pack_version == 1) { #version 1 packing #check blob type my $blob_type = $self->{-blob_type}; unless ($blob_type =~/^[125]$/) # gd added 5 { print STDERR "Error: unrecognized blob type! 1 or 2 only\n"; exit; } #check number of sites my $number_of_sites = $self->{-number_of_sites}; unless ($number_of_sites =~/^\d+$/) { print STDERR "Error: number of sites not available or not correct!\n"; exit; } #check genome version my $genome_version = $self->{-genome_version}; unless ($genome_version =~/\w/) { print STDERR "Error: genome version not available or not correct!\n"; exit; } #check chromosome name my $chr = $self->{-chr}; unless ($chr =~/\w/) { print STDERR "Error: chr not available or not correct!\n"; exit; } #check start pos my $start_pos = $self->{-start_pos}; unless ($start_pos =~/^\d+$/) { $start_pos = ""; } #check end pos my $end_pos = $self->{-end_pos}; unless ($end_pos =~/^\d+$/) { $end_pos = ""; } #check accession my $accession = $self->{-accession}; if ($blob_type ne "1") { $accession = ""; } else { unless ($accession =~/\w/) { print STDERR "Error: blob accession not available or not correct! (blob_type: $blob_type)\n"; # gd added more info to error output exit; } } my $data_element_length = $self->{-data_element_length}; if ($data_element_length) { if (($blob_type eq "5") && (($data_element_length % 8) > 0) ) { print STDERR "Error: blob data_element_length should use bit-length (8 x number_of_charactors)! (blob_type: $blob_type)\n"; # gd added more info to error output exit; } } else { if ($blob_type eq "1") { $data_element_length = 4; } elsif ($blob_type eq "2") { $data_element_length = 32; } elsif ($blob_type eq "5") { print STDERR "Error: blob data_element_length not available or not correct! (blob_type: $blob_type)\n"; # gd added more info to error output exit; } } my $blob_class= $self->{-blob_class}; unless ($blob_class) { if ($blob_type eq "1") { $blob_class = 1; } elsif ($blob_type eq "2") { $blob_class = 2; } elsif ($blob_type eq "5") { $blob_class = 5; } } my $class_specific_fields= $self->{-class_specific_fields}; unless ($class_specific_fields) { $class_specific_fields = ""; } $packingvalue = pack "a3aNa10a25NNa150Nna150a667", $pack_version, $blob_type, $number_of_sites, $genome_version, $chr, $start_pos, $end_pos, $accession, $data_element_length, $blob_class, $class_specific_fields, ""; } else { print STDERR "Error: packing version not recognized!\n"; exit; } return $packingvalue; } sub pack_data { my $self = shift; my $seqstr_ref = shift; unless ($$seqstr_ref=~/\w/){return;} my $packing_version = $self->packing_version(); my $blob_type = $self->blob_type(); my $blob_size = $self->number_of_sites(); my $data_element_length = $self->data_element_length(); my $packed = ""; if (($blob_type == 1) && ($packing_version==1)) { my $length = length $$seqstr_ref; unless ($blob_size == $length) { print STDERR "Error: Number of sites do not match the number of nucleotides!\n"; exit; } my $converted_seqstr = ""; my $i; for ($i=0; $i<$length; $i ++) { my $nt = $iu2hex{substr($$seqstr_ref, $i, 1)}; unless ($nt=~/\w/) { $nt = substr($$seqstr_ref, $i, 1); print STDERR "Error: $nt is not a IUPAC code!\nBLOB TYPE: ".$self->blob_type()."\nCHROM: ".$self->chr()."\n"; exit; } $converted_seqstr.= $nt; } $packed = pack "H$length", $converted_seqstr; #if (($length%2) == 1) #{ # $packed .= pack "H", "E"; #} } elsif (($blob_type == 2) && ($packing_version==1)) { my @positions = split /\s/, $$seqstr_ref; unless ($blob_size == @positions) { my $t = @positions; print STDERR "Error: Number of sites $blob_size do not match the number of positions $t!\n"; exit; } $packed = pack "N$blob_size", @positions; } elsif (($blob_type == 5) && ($packing_version==1)) ## gd added to handle binary_id { my @ids = split /\s/, $$seqstr_ref; unless ($blob_size == @ids) { my $t = @ids; print STDERR "Error: Number of sites $blob_size do not match the number of ids $t!\n"; exit; } if (($data_element_length%8) > 0 ) { print STDERR "Error: blob data_element_length should use bit-length (8 x number_of_charactors)! \n"; exit; } my $marker_id_length = $data_element_length/8; $packed = pack "a$marker_id_length" x $blob_size, @ids; } elsif (($blob_type == 8) && ($packing_version==1)) ## gd added to handle binary_id { if ($] < 5.0092) { my $t = $]; $t =~s/\.0+/./; print "You are using PERL $t. PERL 5.92 is required for packing/unpacking floaing values."; exit; } my @values = split /\s/, $$seqstr_ref; unless ($blob_size == @values) { my $t = @values + 0; print STDERR "Error: Number of sites $blob_size do not match the number of ids $t!\n"; exit; } $packed = pack "(f$blob_size)>", @values; } $packed = $self->header().$packed; return \$packed; } sub load_packed_blob_from_file{ my $self = shift; my $file = shift; my $blob = ""; my $chunk = 104857600; my $buf; open IN, $file; binmode IN; my $i = 0; while(1) { $i++; my $n = read(IN, $buf, $chunk); if($i==1 && $n<1025) { print STDERR "ERROR: Invalid file $file\n"; exit; } if($n == 0){last;} $blob .= $buf; } close(IN); return $blob; } sub load_multi_blob_from_file{ my $self = shift; my $file = shift; my $blob = ""; my $chunk = 104857600; my $buf; open IN, $file; binmode IN; my $i = 0; while(1) { $i++; my $n = read(IN, $buf, $chunk); if($i==1 && $n<1025) { print STDERR "ERROR: Invalid file $file\n"; exit; } if($n == 0){last;} $blob .= $buf; } close(IN); my ($pack_version, $blob_type, $number_of_sites) = unpack "a3aN", $blob; my $size = length $blob; my $valueblobsize = 1024 + int($number_of_sites/2+0.5); my @bloblist = (); for ($i=0; $i<$size; $i+=$valueblobsize) { push @bloblist, substr($blob, $i, $valueblobsize); } return \@bloblist; } sub save_packed_blob_to_file{ my $self = shift; my $file = shift; my $blob = shift; open OUT, ">$file"; binmode OUT; print OUT $blob; close(OUT); } sub seek_pos { my $self = shift; my $blob = shift; # position blob my $pos1 = shift; # position start my $pos2 = shift; # position end # my $head = substr($$blob, 0, 1024); # (my $unpacked_blob_obj,my $unpacked_pos_str) = PackSNP->unpack_blob(\$head); # print "==============>unpacked value blob header\n"; # print "packing_version \t", $unpacked_blob_obj->packing_version(), "\n"; # print "blob_type \t\t", $unpacked_blob_obj->blob_type(), "\n"; # print "number_of_sites \t", $unpacked_blob_obj->number_of_sites(), "\n"; # print "genome_version \t\t", $unpacked_blob_obj->genome_version(), "\n"; # print "chr \t\t\t", $unpacked_blob_obj->chr(), "\n"; # print "start_pos \t\t", $unpacked_blob_obj->start_pos(), "\n"; # print "end_pos \t\t", $unpacked_blob_obj->end_pos(), "\n"; # print "==========\n\n"; my $blobtype = unpack("a", substr($$blob, 3, 1)); my $numsites = unpack("N", substr($$blob, 4, 4)); if( $blobtype != 2) { print STDERR "SEEK_POS ERROR: invalid blob type = " . $blobtype; print STDERR " positions are stored in blobs of type 2\n"; exit; } my $nb = 1024; my $n = length($$blob) - 1024; my $ind1 = -1; my $ind1a = -1; my $ind2 = -1; my $ind2a = -1; my $val1 = -1; my $val1a = -1; my $val2 = -1; my $val2a = -1; my $flag = 0; my $m = int($n/4) - 1; my $m1 = $n/4; if($m != $m1 - 1) { print STDERR "Invalid file length, data block does not align on 4-byte boundary\n"; exit; } my $loc0 = unpack ("N", substr($$blob, $nb, 4)); my $loc1 = unpack ("N", substr($$blob, $nb + 4*$m, 4)); # fixes bug that causes only 255 alleles/positions to be returned # when user enters an end coordinate that is greater than the last position # integer in the position blob string. gd. if($pos2 > $loc1){ $pos2 = $loc1; } if($m1 != $numsites) { print STDERR "Data error: number of sites $numsites is different than actual number of data units $m\n"; exit; } if(($pos1>=$loc0 && $pos1<=$loc1) && ($pos2>=$loc0 && $pos2<=$loc1)) { ($ind1a, $val1a, $ind1, $val1) = $self->locate($blob, $pos1, $nb, $n, -1, -1); if($val1 > $pos2) { $flag = 1; $ind1 = $ind1a; $val1 = $val1a; } ($ind2, $val2, $ind2a, $val2a) = $self->locate($blob, $pos2, $nb, $n, $ind1, -1); } elsif($pos1>=$loc0 && $pos1<=$loc1) { ($ind1a, $val1a, $ind1, $val1) = $self->locate($blob, $pos1, $nb, $n, -1, -1); } elsif($pos2>=$loc0 && $pos2<=$loc1) { ($ind2, $val2, $ind2a, $val2a) = $self->locate($blob, $pos2, $nb, $n, -1, -1); } if($pos1 < $loc0) { $val1 = $loc0; $ind1 = 0; } if($pos2 > $loc1) { $val2 = $loc1; $ind2 = $m1 - 1; } if($flag==1) { $val2 = $val2a; $ind1 = -1; $ind2 = -1; } return ($ind1, $val1, $ind2, $val2); } sub get_range{ my $self = shift; my $blob = shift; my $pos1 = shift; my $pos2 = shift; my $i1; my $v1; my $i2; my $v2; ($i1, $v1, $i2, $v2) = $self->seek_pos($blob, $pos1, $pos2); if($i1==-1 || $i2==-1){return "";} my $rngstr = $v1; for(my $i=$i1+1; $i<$i2; $i++) { $rngstr .= " " . unpack ("N", substr($$blob, 1024+$i*4, 4)); } if($rngstr ne $v2){ $rngstr .= " " . $v2; } return $rngstr; } sub get_data{ my $self = shift; my $blob = shift; my $i1 = shift; my $i2 = shift; if($i1==-1 || $i2==-1){return "";} my $len = $i2 - $i1 + 1; if ((${i1}%2) == 1) { $len ++; } my $datastr = "H" . $len; my $dataraw = unpack ($datastr, substr($$blob, int($i1/2) + 1024, $len)); if ((${i1}%2) == 1) { $dataraw = substr($dataraw, 1); } my $data = ""; for(my $i=0; $i 1) { $vv1 = unpack ("N", substr($$data, $offset+$ii1*4, 4)); $vv2 = unpack ("N", substr($$data, $offset+$ii2*4, 4)); #print "locate $ii1 $vv1 $spos $ii2 $vv2\n"; if($vv1 == $spos){return ($ii1, $vv1, $ii1, $vv1);} if($vv2 == $spos){return ($ii2, $vv2, $ii2, $vv2);} my $ii3 = int(($ii1 + $ii2)/2); my $vv3 = unpack ("N", substr($$data, $offset+$ii3*4, 4)); #print " $ii3 $vv3 \n"; if($vv3 == $spos){return ($ii3, $vv3, $ii3, $vv3);} if($vv3>$spos) { $ii2 = $ii3; $vv2 = $vv3; } else { $ii1 = $ii3; $vv1 = $vv3; } } #print "locate $ii1 $vv1 $spos $ii2 $vv2\n"; return ($ii1, $vv1, $ii2, $vv2); } 1;