package PackSNP; use strict; use vars qw($VERSION); $VERSION = '1.3'; =head1 NAME PackSNP - Perl module for packing and unpacking SNP genotyping data =head1 SYNOPSIS use PackSNP; ######packing and unpacking div_allele_blob data############# ##unpacking div_allele value $datablob = PackSNP->new_header ( -packing_version => "1", -blob_type => "1", -number_of_sites=>5, -genome_version=>"3a", -chr => "chr8", -start_pos=>1, -end_pos => 1000000, -accession => "B73"); $seqstr = "ACGTW"; #pack div_allele value ##return a reference to a binary scalar value with both header and packed data $packed_seq_ref = $datablob->pack_data(\$seqstr); ##unpacking div_allele value #the unpack function takes in a reference to a binary scalar value, and return an array with two elements: a header object and a reference to a string value ($unpacked_blob_obj, $unpacked_seq_str_ref) = PackSNP->unpack_blob($packed_seq_ref); ##print unpqcked value print "unpacked div_allele value blob\n"; print $$unpacked_seq_str_ref, "\n"; print $unpacked_blob_obj->packing_version(), "\n"; print $unpacked_blob_obj->blob_type(), "\n"; print $unpacked_blob_obj->number_of_sites(), "\n"; print $unpacked_blob_obj->genome_version(), "\n"; print $unpacked_blob_obj->chr(), "\n"; print $unpacked_blob_obj->start_pos(), "\n"; print $unpacked_blob_obj->end_pos(), "\n"; print $unpacked_blob_obj->accession(), "\n"; print "\n"; ######packing and unpacking div_allele__assay blob data############# $pos_str = "12 233 344 444 555"; ##pack position header $datablob->blob_type(2); $datablob->accession(""); $packed_pos_ref = $datablob->pack_data(\$pos_str); ##unpacking div_allele positions ($unpacked_blob_obj, $unpacked_pos_str) = PackSNP->unpack_blob($packed_pos_ref); print "unpacked div_allele value blob\n"; print $$unpacked_pos_str, "\n"; print $unpacked_blob_obj->packing_version(), "\n"; print $unpacked_blob_obj->blob_type(), "\n"; print $unpacked_blob_obj->number_of_sites(), "\n"; print $unpacked_blob_obj->genome_version(), "\n"; print $unpacked_blob_obj->chr(), "\n"; print $unpacked_blob_obj->start_pos(), "\n"; print $unpacked_blob_obj->end_pos(), "\n"; print "\n"; =head1 DESCRIPTION This module can be used for packing and unpacking the SNP data. =head2 Methods =over 4 =item * $object= PackSNP->new_header ( -packing_version => , -blob_type => , -number_of_sites=> , -genome_version=> , -chr => , -start_pos=> , -end_pos => , -accession => ); Constructor for creating new header object =item * $object->pack_data($data_ref) This function takes in a reference to the data string. Returns a reference to the binary value of packed data. =item * unpack_blob($blobdata) Returns an array of two elements: reference to a header object and a reference to the value string. =item * $object->packing_version($data_ref) get/set packing version =back =head1 AUTHOR Qi Sun (qisun@cornell.edu) =head1 COPYRIGHT Copyright 2009 Cornell University. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO perl(1). =cut our %iu2hex = qw( A 0 C 1 G 2 T 3 R 4 Y 5 S 6 W 7 K 8 M 9 B A D B H C V D N E - F ); our %hex2iu = qw( 0 A 1 C 2 G 3 T 4 R 5 Y 6 S 7 W 8 K 9 M A B B D C H D V E N F - ); sub new_data_blob{ } sub new_header { my($caller,@args) = @_; # we know our inherietance heirarchy my $self = {@args}; bless $self,$caller; return $self; #padding } sub packing_version { my $self = shift; my $packing_version = shift; if ($packing_version) { $self->{-packing_version} = $packing_version; } return $self->{-packing_version} } sub blob_type { my $self = shift; my $newtype = shift; if ($newtype) { $self->{-blob_type} = $newtype; } return $self->{-blob_type}; } sub number_of_sites { my $self = shift; my $number_of_sites = shift; if ($number_of_sites) { $self->{-number_of_sites} = $number_of_sites; } return $self->{-number_of_sites}; } sub genome_version { my $self = shift; my $genome_version = shift; if ($genome_version) { $self->{-genome_version} = $genome_version; } return $self->{-genome_version}; } sub chr { my $self = shift; my $chr = shift; if ($chr) { $self->{-chr} = $chr; } return $self->{-chr}; } sub start_pos { my $self = shift; my $start_pos = shift; if ($start_pos) { $self->{-start_pos} = $start_pos; } return $self->{-start_pos}; } sub end_pos { my $self = shift; my $end_pos = shift; if ($end_pos) { $self->{-end_pos} = $end_pos; } return $self->{-end_pos}; } sub accession { my $self = shift; my $accession = shift; if ($accession) { $self->{-accession} = $accession; } return $self->{-accession}; } sub unpack_blob { my $s = shift; my $blob_ref = shift; my ($pack_version, $blob_type, $number_of_sites) = unpack "A3AL", $$blob_ref; my ($header_obj, $converted_value); #version 1 div_allele_value blob if (($pack_version == 1) && ($blob_type==1)) { my ($t1, $t2, $t3, $genome_version, $chr, $start_pos, $end_pos, $accession, $t4, $value) = unpack "A3ALA10A25LLA25A948H*", $$blob_ref; for (my $i=0; $i<$number_of_sites; $i++) { $converted_value .= $hex2iu{substr($value, $i, 1)}; } $header_obj = PackSNP->new_header ( -packing_version => $pack_version, -blob_type => $blob_type, -number_of_sites=>$number_of_sites, -genome_version=>$genome_version, -chr => $chr, -start_pos=>$start_pos, -end_pos => $end_pos, -accession => $accession); } elsif (($pack_version == 1) && ($blob_type==2)) { my ($t1, $t2, $t3, $genome_version, $chr, $start_pos, $end_pos, $accession, $t4, @positions) = unpack "A3ALA10A25LLA25A948L*", $$blob_ref; $header_obj = PackSNP->new_header ( -packing_version => $pack_version, -blob_type => $blob_type, -number_of_sites=>$number_of_sites, -genome_version=>$genome_version, -chr => $chr, -start_pos=>$start_pos, -end_pos => $end_pos, -accession => $accession); $converted_value = join " ", @positions; } return ($header_obj, \$converted_value); } sub header { my $self = shift; my $packingvalue=""; my $pack_version = $self->{-packing_version}; if ($pack_version == 1) { #version 1 packing #check blob type my $blob_type = $self->{-blob_type}; unless ($blob_type =~/^[12]$/) { print STDERR "Error: unrecognized blob type! 1 or 2 only\n"; exit; } #check number of sites my $number_of_sites = $self->{-number_of_sites}; unless ($number_of_sites =~/^\d+$/) { print STDERR "Error: number of sites not available or not correct!\n"; exit; } #check genome version my $genome_version = $self->{-genome_version}; unless ($genome_version =~/\w/) { print STDERR "Error: genome version not available or not correct!\n"; exit; } #check chromosome name my $chr = $self->{-chr}; unless ($chr =~/\w/) { print STDERR "Error: chr not available or not correct!\n"; exit; } #check start pos my $start_pos = $self->{-start_pos}; unless ($start_pos =~/^\d+$/) { $start_pos = ""; } #check end pos my $end_pos = $self->{-end_pos}; unless ($end_pos =~/^\d+$/) { $end_pos = ""; } #check accession my $accession = $self->{-accession}; if ($blob_type eq "2") { $accession = ""; } else { unless ($accession =~/\w/) { print STDERR "Error: accession not available or not correct!\n"; exit; } } $packingvalue = pack "A3ALA10A25LLA25A948", $pack_version, $blob_type, $number_of_sites, $genome_version, $chr, $start_pos, $end_pos, $accession, ""; } else { print STDERR "Error: packing version not recognized!\n"; exit; } return $packingvalue; } sub pack_data { my $self = shift; my $seqstr_ref = shift; unless ($$seqstr_ref=~/\w/){return;} my $packing_version = $self->packing_version(); my $blob_type = $self->blob_type(); my $blob_size = $self->{-number_of_sites}; my $packed = ""; if (($blob_type == 1) && ($packing_version==1)) { my $length = length $$seqstr_ref; unless ($blob_size == $length) { print STDERR "Error: Number of sites do not match the number of nucleotides!\n"; exit; } my $converted_seqstr = ""; my $i; for ($i=0; $i<$length; $i ++) { my $nt = $iu2hex{substr($$seqstr_ref, $i, 1)}; unless ($nt=~/\w/) { $nt = substr($$seqstr_ref, $i, 1); print STDERR "Error: $nt is not a IUPAC code!\n"; exit; } $converted_seqstr.= $nt; } $packed = pack "H$length", $converted_seqstr; } elsif (($blob_type == 2) && ($packing_version==1)) { my @positions = split /\s/, $$seqstr_ref; unless ($blob_size == @positions) { my $t = @positions; print STDERR "Error: Number of sites $blob_size do not match the number of positions $t!\n"; exit; } $packed = pack "L$blob_size", @positions; } $packed = $self->header().$packed; return \$packed; } 1;