Skip to content

V0.4 #127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft

V0.4 #127

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
GeneticVariantBase = "2447270c-d849-4bf9-ac0d-b5c0b265991c"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Expand Down
7 changes: 6 additions & 1 deletion src/SnpArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ __precompile__()

module SnpArrays

using GeneticVariantBase
using CodecZlib, CodecXz, CodecBzip2, CodecZstd, TranscodingStreams
using Adapt, Glob, LinearAlgebra, LoopVectorization, Missings, Mmap, Printf
using Requires, SparseArrays, Statistics, StatsBase
Expand All @@ -18,14 +19,16 @@ import Tables: table
export AbstractSnpArray, AbstractSnpBitMatrix, AbstractSnpLinAlg
export SnpArray, SnpBitMatrix, SnpLinAlg, SnpData, StackedSnpArray
export compress_plink, decompress_plink, split_plink, merge_plink, write_plink
export counts, grm, grm_admixture, maf, mean, minorallele, missingpos, missingrate
export counts, grm, grm_admixture, maf, maf!, mean, minorallele, missingpos, missingrate
export std, var, vcf2plink
export counts, grm, maf, mean, minorallele, missingpos, missingrate, std, var
export vcf2plink, kinship_pruning
export ADDITIVE_MODEL, DOMINANT_MODEL, RECESSIVE_MODEL
export CuSnpArray
import VariantCallFormat: findgenokey, VCF, header

# The export statements above expose the functions that users of this module need.

const ADDITIVE_MODEL = Val(1)
const DOMINANT_MODEL = Val(2)
const RECESSIVE_MODEL = Val(3)
Expand All @@ -43,6 +46,8 @@ include("linalg_bitmatrix.jl")
include("reorder.jl")
include("vcf2plink.jl")
include("admixture.jl")
include("iterator.jl")

AbstractSnpArray = Union{SnpArray, SubArray{UInt8, 1, SnpArray}, SubArray{UInt8, 2, SnpArray},
StackedSnpArray, SubArray{UInt8, 1, StackedSnpArray}, SubArray{UInt8, 2, StackedSnpArray}}

Expand Down
60 changes: 60 additions & 0 deletions src/SnpArrays.jl.31595.mem
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
- __precompile__()
-
14681029 module SnpArrays
-
- using GeneticVariantBase
- using CodecZlib, CodecXz, CodecBzip2, CodecZstd, TranscodingStreams
- using Adapt, Glob, LinearAlgebra, LoopVectorization, Missings, Mmap, Printf
- using Requires, SparseArrays, Statistics, StatsBase
- import Base: IndexStyle, convert, copyto!, eltype, getindex, setindex!, length, size, wait
- import DataFrames: DataFrame, rename!, eachrow
- import DelimitedFiles: readdlm, writedlm
- import CSV # for CSV.read, to avoid clash with Base.read
- import LinearAlgebra: copytri!, mul!
- import Statistics: mean, std, var
- import StatsBase: counts
- import SpecialFunctions: gamma_inc
- import VectorizationBase: gesp
- import Tables: table
- export AbstractSnpArray, AbstractSnpBitMatrix, AbstractSnpLinAlg
- export SnpArray, SnpBitMatrix, SnpLinAlg, SnpData, StackedSnpArray
- export compress_plink, decompress_plink, split_plink, merge_plink, write_plink
- export counts, grm, grm_admixture, maf, maf!, mean, minorallele, missingpos, missingrate
- export std, var, vcf2plink
- export counts, grm, maf, mean, minorallele, missingpos, missingrate, std, var
- export vcf2plink, kinship_pruning
- export ADDITIVE_MODEL, DOMINANT_MODEL, RECESSIVE_MODEL
- export CuSnpArray
- import VariantCallFormat: findgenokey, VCF, header
-
- # this is exporting functions necessary
-
- const ADDITIVE_MODEL = Val(1)
- const DOMINANT_MODEL = Val(2)
- const RECESSIVE_MODEL = Val(3)
-
- include("codec.jl")
- include("snparray.jl")
- include("stackedsnparray.jl")
- include("filter.jl")
- include("cat.jl")
- include("snpdata.jl")
- include("grm.jl")
- include("kinship_pruning.jl")
- include("linalg_direct.jl")
- include("linalg_bitmatrix.jl")
- include("reorder.jl")
- include("vcf2plink.jl")
- include("admixture.jl")
- include("iterator.jl")
-
- AbstractSnpArray = Union{SnpArray, SubArray{UInt8, 1, SnpArray}, SubArray{UInt8, 2, SnpArray},
- StackedSnpArray, SubArray{UInt8, 1, StackedSnpArray}, SubArray{UInt8, 2, StackedSnpArray}}
-
- datadir(parts...) = joinpath(@__DIR__, "..", "data", parts...)
-
- function __init__()
- @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
- end
-
- end # module
60 changes: 60 additions & 0 deletions src/SnpArrays.jl.31601.mem
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
- __precompile__()
-
- module SnpArrays
-
- using GeneticVariantBase
- using CodecZlib, CodecXz, CodecBzip2, CodecZstd, TranscodingStreams
- using Adapt, Glob, LinearAlgebra, LoopVectorization, Missings, Mmap, Printf
- using Requires, SparseArrays, Statistics, StatsBase
- import Base: IndexStyle, convert, copyto!, eltype, getindex, setindex!, length, size, wait
- import DataFrames: DataFrame, rename!, eachrow
- import DelimitedFiles: readdlm, writedlm
- import CSV # for CSV.read, to avoid clash with Base.read
- import LinearAlgebra: copytri!, mul!
- import Statistics: mean, std, var
- import StatsBase: counts
- import SpecialFunctions: gamma_inc
- import VectorizationBase: gesp
- import Tables: table
- export AbstractSnpArray, AbstractSnpBitMatrix, AbstractSnpLinAlg
- export SnpArray, SnpBitMatrix, SnpLinAlg, SnpData, StackedSnpArray
- export compress_plink, decompress_plink, split_plink, merge_plink, write_plink
- export counts, grm, grm_admixture, maf, maf!, mean, minorallele, missingpos, missingrate
- export std, var, vcf2plink
- export counts, grm, maf, mean, minorallele, missingpos, missingrate, std, var
- export vcf2plink, kinship_pruning
- export ADDITIVE_MODEL, DOMINANT_MODEL, RECESSIVE_MODEL
- export CuSnpArray
- import VariantCallFormat: findgenokey, VCF, header
-
- # this is exporting functions necessary
-
- const ADDITIVE_MODEL = Val(1)
- const DOMINANT_MODEL = Val(2)
- const RECESSIVE_MODEL = Val(3)
-
- include("codec.jl")
- include("snparray.jl")
- include("stackedsnparray.jl")
- include("filter.jl")
- include("cat.jl")
- include("snpdata.jl")
- include("grm.jl")
- include("kinship_pruning.jl")
- include("linalg_direct.jl")
- include("linalg_bitmatrix.jl")
- include("reorder.jl")
- include("vcf2plink.jl")
- include("admixture.jl")
- include("iterator.jl")
-
- AbstractSnpArray = Union{SnpArray, SubArray{UInt8, 1, SnpArray}, SubArray{UInt8, 2, SnpArray},
- StackedSnpArray, SubArray{UInt8, 1, StackedSnpArray}, SubArray{UInt8, 2, StackedSnpArray}}
-
- datadir(parts...) = joinpath(@__DIR__, "..", "data", parts...)
-
- function __init__()
0 @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
- end
-
- end # module
60 changes: 60 additions & 0 deletions src/SnpArrays.jl.31603.mem
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
- __precompile__()
-
- module SnpArrays
-
- using GeneticVariantBase
- using CodecZlib, CodecXz, CodecBzip2, CodecZstd, TranscodingStreams
- using Adapt, Glob, LinearAlgebra, LoopVectorization, Missings, Mmap, Printf
- using Requires, SparseArrays, Statistics, StatsBase
- import Base: IndexStyle, convert, copyto!, eltype, getindex, setindex!, length, size, wait
- import DataFrames: DataFrame, rename!, eachrow
- import DelimitedFiles: readdlm, writedlm
- import CSV # for CSV.read, to avoid clash with Base.read
- import LinearAlgebra: copytri!, mul!
- import Statistics: mean, std, var
- import StatsBase: counts
- import SpecialFunctions: gamma_inc
- import VectorizationBase: gesp
- import Tables: table
- export AbstractSnpArray, AbstractSnpBitMatrix, AbstractSnpLinAlg
- export SnpArray, SnpBitMatrix, SnpLinAlg, SnpData, StackedSnpArray
- export compress_plink, decompress_plink, split_plink, merge_plink, write_plink
- export counts, grm, grm_admixture, maf, maf!, mean, minorallele, missingpos, missingrate
- export std, var, vcf2plink
- export counts, grm, maf, mean, minorallele, missingpos, missingrate, std, var
- export vcf2plink, kinship_pruning
- export ADDITIVE_MODEL, DOMINANT_MODEL, RECESSIVE_MODEL
- export CuSnpArray
- import VariantCallFormat: findgenokey, VCF, header
-
- # this is exporting functions necessary
-
- const ADDITIVE_MODEL = Val(1)
- const DOMINANT_MODEL = Val(2)
- const RECESSIVE_MODEL = Val(3)
-
- include("codec.jl")
- include("snparray.jl")
- include("stackedsnparray.jl")
- include("filter.jl")
- include("cat.jl")
- include("snpdata.jl")
- include("grm.jl")
- include("kinship_pruning.jl")
- include("linalg_direct.jl")
- include("linalg_bitmatrix.jl")
- include("reorder.jl")
- include("vcf2plink.jl")
- include("admixture.jl")
- include("iterator.jl")
-
- AbstractSnpArray = Union{SnpArray, SubArray{UInt8, 1, SnpArray}, SubArray{UInt8, 2, SnpArray},
- StackedSnpArray, SubArray{UInt8, 1, StackedSnpArray}, SubArray{UInt8, 2, StackedSnpArray}}
-
- datadir(parts...) = joinpath(@__DIR__, "..", "data", parts...)
-
- function __init__()
0 @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
- end
-
- end # module
123 changes: 123 additions & 0 deletions src/iterator.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Iterator wrapper around a `SnpData` object, declared as a subtype of
# `GeneticVariantBase.VariantIterator`.
# The `Base.iterate` method for this type walks the indices
# 1 through `size(snpdata.snparray, 2)` and yields one `SnpArrayIndex` per step.
mutable struct SnpArrayIterator <: GeneticVariantBase.VariantIterator
# The data set whose `snparray` second dimension is iterated over.
snpdata::SnpData
end

# Handle for a single variant, declared as a subtype of `GeneticVariantBase.Variant`.
# It stores only an integer position; the accessor methods in this file use it as
# the row index into `s.snp_info` and as the column index into `s.snparray`.
mutable struct SnpArrayIndex <: GeneticVariantBase.Variant
# 1-based position produced by `Base.iterate(::SnpArrayIterator, state)`.
index::Int
end

"""
    Base.eltype(::Type{SnpArrayIterator})

Return `SnpArrayIndex`, the type of the items yielded when iterating a
`SnpArrayIterator`.

The method is restricted to `SnpArrayIterator` on purpose: defining it for every
`VariantIterator` subtype would declare `SnpArrayIndex` as the element type of
iterators owned by other packages.
"""
@inline function Base.eltype(::Type{SnpArrayIterator})
    return SnpArrayIndex
end

"""
    Base.iterate(itr::SnpArrayIterator, state=1)

Produce the next item of `itr`.

Returns `(SnpArrayIndex(state), state + 1)` while `state` does not exceed
`size(itr.snpdata.snparray, 2)`, and `nothing` once it does.

# Throws
- `BoundsError` when `state` is zero or negative.
"""
function Base.iterate(itr::SnpArrayIterator, state=1)
    state > 0 || throw(BoundsError(itr, state))
    ncols = size(itr.snpdata.snparray, 2)
    state <= ncols || return nothing
    return (SnpArrayIndex(state), state + 1)
end

"""
    Base.length(itr::SnpArrayIterator)

Return the number of items `itr` yields, i.e. the size of the second dimension
of `itr.snpdata.snparray`.
"""
@inline function Base.length(itr::SnpArrayIterator)
    snps = itr.snpdata.snparray
    return size(snps, 2)
end

"""
    GeneticVariantBase.chrom(s::SnpData, snpindex::SnpArrayIndex)

Return the `:chromosome` entry of `s.snp_info` at row `snpindex.index`,
converted to `String`.
"""
function GeneticVariantBase.chrom(s::SnpData, snpindex::SnpArrayIndex)::String
    row = snpindex.index
    return s.snp_info[row, :chromosome]
end

"""
    GeneticVariantBase.pos(s::SnpData, snpindex::SnpArrayIndex)

Return the `:position` entry of `s.snp_info` at row `snpindex.index`,
converted to `Int`.
"""
function GeneticVariantBase.pos(s::SnpData, snpindex::SnpArrayIndex)::Int
    row = snpindex.index
    return s.snp_info[row, :position]
end

"""
    GeneticVariantBase.rsid(s::SnpData, snpindex::SnpArrayIndex)

Return the `:snpid` entry of `s.snp_info` at row `snpindex.index`,
converted to `String`.
"""
function GeneticVariantBase.rsid(s::SnpData, snpindex::SnpArrayIndex)::String
    row = snpindex.index
    return s.snp_info[row, :snpid]
end

# SnpData is declared as a subtype of GeneticData (see src/snpdata.jl).

"""
    alleles(s::SnpData, snpindex::SnpArrayIndex)

Return a two-element `Vector{String}` holding the `:allele1` and `:allele2`
entries (in that order) of `s.snp_info` at row `snpindex.index`.
"""
# NOTE(review): unlike the neighbouring accessors, this method is not qualified
# with `GeneticVariantBase.` -- confirm whether it is meant to extend a function
# from that package.
function alleles(s::SnpData, snpindex::SnpArrayIndex)::Vector{String}
    row = snpindex.index
    info = s.snp_info
    return [info[row, :allele1], info[row, :allele2]]
end

"""
    GeneticVariantBase.alt_allele(s::SnpData, snpindex::SnpArrayIndex)

Return the `:allele2` entry of `s.snp_info` at row `snpindex.index`,
converted to `String`.
"""
function GeneticVariantBase.alt_allele(s::SnpData, snpindex::SnpArrayIndex)::String
    row = snpindex.index
    return s.snp_info[row, :allele2]
end

"""
    GeneticVariantBase.ref_allele(s::SnpData, snpindex::SnpArrayIndex)

Return the `:allele1` entry of `s.snp_info` at row `snpindex.index`,
converted to `String`.
"""
function GeneticVariantBase.ref_allele(s::SnpData, snpindex::SnpArrayIndex)::String
    row = snpindex.index
    return s.snp_info[row, :allele1]
end

# Immutable container for a `Vector{Float64}`.
# `calculate_maf_data` fills it with the result of `maf(s.snparray)`, and
# `maf_index` reads single entries back out by `SnpArrayIndex`.
struct MAFData
# One value per position addressable by `SnpArrayIndex.index`
# (presumably per-variant minor allele frequencies -- confirm against `maf`).
maf_vector::Vector{Float64}
end

# TODO: fold the MAF helpers below into the `GeneticVariantBase.maf` function name.

"""
    calculate_maf_data(s::SnpData)

Evaluate `maf(s.snparray)` once and wrap the resulting vector in a `MAFData`.
"""
calculate_maf_data(s::SnpData) = MAFData(maf(s.snparray))

"""
    maf_index(maf_data::MAFData, snpindex::SnpArrayIndex)

Return the entry of `maf_data.maf_vector` at position `snpindex.index`.
"""
function maf_index(maf_data::MAFData, snpindex::SnpArrayIndex)
    values = maf_data.maf_vector
    return values[snpindex.index]
end


"""
    GeneticVariantBase.maf(s::SnpData, snpindex::SnpArrayIndex)

Return the entry at position `snpindex.index` of the vector produced by
`maf(s.snparray)`.
"""
# NOTE(review): `maf` is evaluated on the whole `s.snparray` on every call even
# though only one entry is used; consider reusing a precomputed `MAFData`.
function GeneticVariantBase.maf(s::SnpData, snpindex::SnpArrayIndex)
    return maf(s.snparray)[snpindex.index]
end

"""
    GeneticVariantBase.hwepval(s::SnpData, snpindex::SnpArrayIndex)

Count the genotype codes in column `snpindex.index` of `s.snparray` and return
`hwe(n00, n01, n11)`, where `n00`, `n01` and `n11` are the numbers of entries
equal to `0x00`, `0x02` and `0x03` respectively. Entries with any other code
(for example `0x01`) are not counted.
"""
function GeneticVariantBase.hwepval(s::SnpData, snpindex::SnpArrayIndex)
    # A view avoids copying the column, and `count` with a predicate avoids the
    # temporary boolean arrays that `sum(genotypes .== code)` would allocate.
    genotypes = @view s.snparray[:, snpindex.index]

    n00 = count(==(0x00), genotypes)
    n01 = count(==(0x02), genotypes)
    n11 = count(==(0x03), genotypes)

    return hwe(n00, n01, n11)
end

# Genotype codes stored in a SnpArray:
#   0 - homozygous for allele 1
#   2 - heterozygous
#   3 - homozygous for allele 2
#   1 - missing genotype

"""
    GeneticVariantBase.alt_dosages!(arr, s::SnpData, snpindex::SnpArrayIndex)

Fill `arr` by delegating to `GeneticVariantBase.alt_genotypes!` with the same
arguments, then return `arr`.
"""
function GeneticVariantBase.alt_dosages!(
    arr::AbstractArray{T}, s::SnpData, snpindex::SnpArrayIndex
) where {T<:Real}
    GeneticVariantBase.alt_genotypes!(arr, s, snpindex)
    return arr
end

# TODO: make sure all genotypes for a single sample can be read in.
# TODO: support filtering SNPs.
"""
    GeneticVariantBase.alt_genotypes!(arr, s::SnpData, snpindex::SnpArrayIndex)

Copy column `snpindex.index` of `s.snparray` into `arr` using `copyto!` with
`impute=true` and `center=true`, then return `arr`.

The meaning of the two keywords is defined by the `copyto!` method that accepts
them, which lives outside this file.
"""
function GeneticVariantBase.alt_genotypes!(
    arr::AbstractArray{T}, s::SnpData, snpindex::SnpArrayIndex
) where {T<:Real}
    column = @view(s.snparray[:, snpindex.index])
    Base.copyto!(arr, column; impute=true, center=true)
    return arr
end

"""
    n_samples(s::SnpData)

Return the size of the first dimension of `s.snparray` as an `Int`.
"""
n_samples(s::SnpData)::Int = size(s.snparray, 1)

"""
    n_variants(s::SnpData)

Return the size of the second dimension of `s.snparray` as an `Int`.
"""
n_variants(s::SnpData)::Int = size(s.snparray, 2)
2 changes: 1 addition & 1 deletion src/snpdata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const PERSON_INFO_KEYS = [:fid, :iid, :father, :mother, :sex, :phenotype]

Type to store SNP and person information along with the SnpArray.
"""
struct SnpData
struct SnpData <: GeneticData
people::Int
snps::Int
snparray::SnpArray
Expand Down
Loading