Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit d374c68

Browse files
committedMar 17, 2020
some stuff + a new segfault
1 parent 180314f commit d374c68

File tree

6 files changed

+567
-1
lines changed

6 files changed

+567
-1
lines changed
 

‎getcoords.cpp

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#include <vector>
2+
#include <string>
3+
#include <iostream>
4+
#include <fstream>
5+
#include <sstream>
6+
#include <cstdio>
7+
#include <map>
8+
#include <limits>
9+
#include <ctime>
10+
#include <cstdio>
11+
#include <cstring>
12+
#include <utility>
13+
#include <cmath>
14+
#include <algorithm>
15+
#include "lib/gtf-cpp/gtf.h"
16+
#include "lib/pdqsort/pdqsort.h"
17+
#include "lib/fasta-cpp/fasta.h"
18+
#include "introns.h"
19+
20+
static int get_introns(const std::string& gtffile,
21+
std::vector<introns::Intron>& outputintrons) {
22+
23+
// open the GTF file
24+
GTFFile gtf;
25+
gtf.setfilename(gtffile);
26+
try {
27+
gtf.load_filter([](const auto& seq) {
28+
return seq.feature == "exon";
29+
});
30+
} catch (const GTFError& e) {
31+
std::cerr << e.what() << '\n';
32+
return 1;
33+
}
34+
35+
std::vector<GTFSequence>& tmpexons = gtf.getall();
36+
37+
// sort exons by start
38+
pdqsort(tmpexons.begin(), tmpexons.end(), [](GTFSequence& a, GTFSequence& b) {
39+
return a.start < b.start;
40+
});
41+
42+
// split the exons into a map:
43+
/*
44+
* {
45+
* gene_id: {
46+
* transcript_id: {
47+
* { exons, introns }
48+
* }
49+
* }
50+
* }
51+
*/
52+
std::map<std::string, // gene_id
53+
std::map<std::string, // transcript_id
54+
std::pair<std::vector<GTFSequence>, // exons
55+
std::vector<introns::Intron>>>> // introns
56+
exons_by_transcript_by_gene;
57+
for (auto& exon : tmpexons) {
58+
exons_by_transcript_by_gene
59+
[exon.attributes["gene_id"]]
60+
[exon.attributes["transcript_id"]].first.push_back(exon);
61+
}
62+
63+
// clean up memory as needed
64+
for (auto& [gene_id, tmap] : exons_by_transcript_by_gene) {
65+
for (auto& [tid, emap] : tmap) {
66+
emap.first.shrink_to_fit();
67+
}
68+
}
69+
70+
std::cout << "-> Loaded " << tmpexons.size() << " exons\n";
71+
72+
outputintrons.clear();
73+
std::string tmpstr;
74+
int tcount = 0;
75+
for (auto& [gene_id, exons_by_transcript] : exons_by_transcript_by_gene) {
76+
for (auto& [transcript_id, pair] : exons_by_transcript) {
77+
//if (tcount++ == 10) goto done;
78+
//std::cerr << "gene_id: " << gene_id << "; transcript " << transcript_id << '\n';
79+
auto& [exons, _introns] = pair;
80+
for (std::size_t i = 0; i < exons.size() - 1; i++) {
81+
// 5' is the first 14 chars, 3' is the last 18
82+
// the full sequence is everything minus the first 3 and last 4
83+
_introns.push_back({
84+
exons[i].seqname,//sequence
85+
"", // 3'
86+
"",//tmpstr.substr(tmpstr.size() - 18), // 3'
87+
"", // B'
88+
"", // whole intron
89+
exons[i].end-2, exons[i+1].start+3, // start and end
90+
transcript_id,
91+
gene_id,
92+
{ transcript_id }, // set the list of tids
93+
.strand = exons[i].strand, // negative?
94+
});
95+
}
96+
}
97+
}
98+
99+
done:
100+
101+
#if 0
102+
// keep a map of gene id to transcript id to duplicate counts
103+
std::map<std::string, std::map<std::string, unsigned long long>> dupecounts;
104+
105+
for (auto& [gene_id, exons_by_transcript] : exons_by_transcript_by_gene) {
106+
for (auto& [transcript_id, pair] : exons_by_transcript) {
107+
auto& [exons, _introns] = pair;
108+
for (auto& intron : _introns) {
109+
for (auto& [ts_id, pair2] : exons_by_transcript) {
110+
auto& [exons2, _introns2] = pair2;
111+
for (auto& other : _introns2) {
112+
if (&other == &intron) continue;
113+
if (!other.keep_in_output) continue;
114+
if (std::abs((long long)intron.start - (long long)other.start) <= 20L && std::abs((long long)intron.end - (long long)other.end) <= 20L) {
115+
intron.keep_in_output = true;
116+
dupecounts[gene_id][ts_id]++;
117+
if (std::find(other.all_transcripts.begin(),
118+
other.all_transcripts.end(),
119+
transcript_id)
120+
== other.all_transcripts.end()) {
121+
other.all_transcripts.push_back(transcript_id);
122+
}
123+
} else {
124+
// if they overlap still, we need to check if one is
125+
// really big and throw it out if it's huge
126+
if (intron.start < other.end && intron.end > other.start) {
127+
if (intron.full_sequence.size() > other.full_sequence.size()) {
128+
intron.keep_in_output = false;
129+
} else if (other.full_sequence.size() > intron.full_sequence.size()) {
130+
other.keep_in_output = false;
131+
}
132+
}
133+
}
134+
}
135+
}
136+
}
137+
}
138+
}
139+
#endif
140+
141+
for (auto& [gene_id, exons_by_transcript] : exons_by_transcript_by_gene) {
142+
for (auto& [transcript_id, pair] : exons_by_transcript) {
143+
auto& [exons, _introns] = pair;
144+
for (auto& intron : _introns) {
145+
if (intron.keep_in_output) {
146+
outputintrons.push_back(intron);
147+
}
148+
}
149+
}
150+
}
151+
152+
outputintrons.shrink_to_fit();
153+
154+
std::cout << "-> Loaded " << outputintrons.size() << " introns\n";
155+
156+
#if 0
157+
std::time_t t = std::time(nullptr);
158+
char mbstr[100];
159+
std::strftime(mbstr, 100, "duplicatestats-%Y%m%d%H%M.csv", std::localtime(&t));
160+
161+
std::ofstream outfile(mbstr);
162+
if (!outfile) {
163+
std::cerr << "Error opening " << mbstr << " for writing!\n"
164+
<< "Continuing without duplicate stats output.\n";
165+
} else {
166+
outfile << "\"gene_id\",\"transcript_id\",\"duplicates\"\r\n";
167+
for (auto& [gid, ts] : dupecounts) {
168+
for (auto& [tid, dupes] : ts) {
169+
outfile << '"' << gid << "\",\"" << tid << "\"," << dupes << "\r\n";
170+
}
171+
}
172+
outfile.close();
173+
std::cout << "-> Wrote duplicate stats to " << mbstr << '\n';
174+
}
175+
#endif
176+
177+
return 0;
178+
}
179+
180+
int main(int argc, char** argv) {
181+
if (argc < 4) {
182+
std::cerr <<
183+
R"usage(
184+
usage: getcoords <gtf> <out+> <out->
185+
gtf: the GTF file to read from
186+
out+: the file to write positive strand introns to
187+
out-: the file to write negative strand introns to
188+
)usage";
189+
return 1;
190+
}
191+
std::string fname = argv[1];
192+
std::string posfile = argv[2];
193+
std::string negfile = argv[3];
194+
195+
std::vector<introns::Intron> introns;
196+
get_introns(fname, introns);
197+
198+
std::ofstream outpos(posfile);
199+
if (!outpos) {
200+
std::cerr << "Could not open positive file for writing!\n";
201+
return 1;
202+
}
203+
204+
std::ofstream outneg(negfile);
205+
if (!outneg) {
206+
std::cerr << "Could not open negative file for writing!\n";
207+
return 1;
208+
}
209+
210+
for (auto& i : introns) {
211+
(i.strand == '-' ? outneg : outpos) << i.gene_id << ',' << i.transcript_id << ',' << i.sequence << ':' << i.start << '-' << i.end << '\n';
212+
}
213+
214+
return 0;
215+
}

‎introns.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,13 @@ namespace introns {
3333

3434
// Structure representing an intron
3535
struct Intron {
36+
std::string sequence;
3637
// 5' sequence
3738
std::string five_prime;
3839
// 3' sequence
3940
std::string three_prime;
41+
// B' sequence
42+
std::string branch_point;
4043
// full intron *not counting upstream and downstream exon nt*
4144
std::string full_sequence;
4245
// start and end point (inclusive; no exonic nts)
@@ -114,8 +117,10 @@ namespace introns {
114117

115118
Intron operator!(Intron& i) {
116119
Intron r = {
120+
i.sequence,
117121
reverse(complement(i.five_prime)),
118122
reverse(complement(i.three_prime)),
123+
reverse(complement(i.branch_point)),
119124
reverse(complement(i.full_sequence)),
120125
i.start,
121126
i.end,

‎lib/fasta-cpp

‎main.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,10 @@ static int get_introns(const std::string& gtffile,
214214
// 5' is the first 14 chars, 3' is the last 18
215215
// the full sequence is everything minus the first 3 and last 4
216216
_introns.push_back({
217+
exons[i].seqname, // sequence
217218
tmpstr.substr(0, 14), // 5'
218219
tmpstr.substr(tmpstr.size() - 18), // 3'
220+
"", // B'
219221
tmpstr.substr(3, tmpstr.size() - 7), // whole intron
220222
exons[i].end+1, exons[i+1].start-1, // start and end
221223
transcript_id,

0 commit comments

Comments
 (0)
Please sign in to comment.