|
| 1 | +#include <vector> |
| 2 | +#include <string> |
| 3 | +#include <iostream> |
| 4 | +#include <fstream> |
| 5 | +#include <sstream> |
| 6 | +#include <cstdio> |
| 7 | +#include <map> |
| 8 | +#include <limits> |
| 9 | +#include <ctime> |
| 10 | +#include <cstdio> |
| 11 | +#include <cstring> |
| 12 | +#include <utility> |
| 13 | +#include <cmath> |
| 14 | +#include <algorithm> |
| 15 | +#include "lib/gtf-cpp/gtf.h" |
| 16 | +#include "lib/pdqsort/pdqsort.h" |
| 17 | +#include "lib/fasta-cpp/fasta.h" |
| 18 | +#include "introns.h" |
| 19 | + |
| 20 | +static int get_introns(const std::string& gtffile, |
| 21 | + std::vector<introns::Intron>& outputintrons) { |
| 22 | + |
| 23 | + // open the GTF file |
| 24 | + GTFFile gtf; |
| 25 | + gtf.setfilename(gtffile); |
| 26 | + try { |
| 27 | + gtf.load_filter([](const auto& seq) { |
| 28 | + return seq.feature == "exon"; |
| 29 | + }); |
| 30 | + } catch (const GTFError& e) { |
| 31 | + std::cerr << e.what() << '\n'; |
| 32 | + return 1; |
| 33 | + } |
| 34 | + |
| 35 | + std::vector<GTFSequence>& tmpexons = gtf.getall(); |
| 36 | + |
| 37 | + // sort exons by start |
| 38 | + pdqsort(tmpexons.begin(), tmpexons.end(), [](GTFSequence& a, GTFSequence& b) { |
| 39 | + return a.start < b.start; |
| 40 | + }); |
| 41 | + |
| 42 | + // split the exons into a map: |
| 43 | + /* |
| 44 | + * { |
| 45 | + * gene_id: { |
| 46 | + * transcript_id: { |
| 47 | + * { exons, introns } |
| 48 | + * } |
| 49 | + * } |
| 50 | + * } |
| 51 | + */ |
| 52 | + std::map<std::string, // gene_id |
| 53 | + std::map<std::string, // transcript_id |
| 54 | + std::pair<std::vector<GTFSequence>, // exons |
| 55 | + std::vector<introns::Intron>>>> // introns |
| 56 | + exons_by_transcript_by_gene; |
| 57 | + for (auto& exon : tmpexons) { |
| 58 | + exons_by_transcript_by_gene |
| 59 | + [exon.attributes["gene_id"]] |
| 60 | + [exon.attributes["transcript_id"]].first.push_back(exon); |
| 61 | + } |
| 62 | + |
| 63 | + // clean up memory as needed |
| 64 | + for (auto& [gene_id, tmap] : exons_by_transcript_by_gene) { |
| 65 | + for (auto& [tid, emap] : tmap) { |
| 66 | + emap.first.shrink_to_fit(); |
| 67 | + } |
| 68 | + } |
| 69 | + |
| 70 | + std::cout << "-> Loaded " << tmpexons.size() << " exons\n"; |
| 71 | + |
| 72 | + outputintrons.clear(); |
| 73 | + std::string tmpstr; |
| 74 | + int tcount = 0; |
| 75 | + for (auto& [gene_id, exons_by_transcript] : exons_by_transcript_by_gene) { |
| 76 | + for (auto& [transcript_id, pair] : exons_by_transcript) { |
| 77 | + //if (tcount++ == 10) goto done; |
| 78 | + //std::cerr << "gene_id: " << gene_id << "; transcript " << transcript_id << '\n'; |
| 79 | + auto& [exons, _introns] = pair; |
| 80 | + for (std::size_t i = 0; i < exons.size() - 1; i++) { |
| 81 | + // 5' is the first 14 chars, 3' is the last 18 |
| 82 | + // the full sequence is everything minus the first 3 and last 4 |
| 83 | + _introns.push_back({ |
| 84 | + exons[i].seqname,//sequence |
| 85 | + "", // 3' |
| 86 | + "",//tmpstr.substr(tmpstr.size() - 18), // 3' |
| 87 | + "", // B' |
| 88 | + "", // whole intron |
| 89 | + exons[i].end-2, exons[i+1].start+3, // start and end |
| 90 | + transcript_id, |
| 91 | + gene_id, |
| 92 | + { transcript_id }, // set the list of tids |
| 93 | + .strand = exons[i].strand, // negative? |
| 94 | + }); |
| 95 | + } |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | +done: |
| 100 | + |
| 101 | +#if 0 |
| 102 | + // keep a map of gene id to transcript id to duplicate counts |
| 103 | + std::map<std::string, std::map<std::string, unsigned long long>> dupecounts; |
| 104 | + |
| 105 | + for (auto& [gene_id, exons_by_transcript] : exons_by_transcript_by_gene) { |
| 106 | + for (auto& [transcript_id, pair] : exons_by_transcript) { |
| 107 | + auto& [exons, _introns] = pair; |
| 108 | + for (auto& intron : _introns) { |
| 109 | + for (auto& [ts_id, pair2] : exons_by_transcript) { |
| 110 | + auto& [exons2, _introns2] = pair2; |
| 111 | + for (auto& other : _introns2) { |
| 112 | + if (&other == &intron) continue; |
| 113 | + if (!other.keep_in_output) continue; |
| 114 | + if (std::abs((long long)intron.start - (long long)other.start) <= 20L && std::abs((long long)intron.end - (long long)other.end) <= 20L) { |
| 115 | + intron.keep_in_output = true; |
| 116 | + dupecounts[gene_id][ts_id]++; |
| 117 | + if (std::find(other.all_transcripts.begin(), |
| 118 | + other.all_transcripts.end(), |
| 119 | + transcript_id) |
| 120 | + == other.all_transcripts.end()) { |
| 121 | + other.all_transcripts.push_back(transcript_id); |
| 122 | + } |
| 123 | + } else { |
| 124 | + // if they overlap still, we need to check if one is |
| 125 | + // really big and throw it out if it's huge |
| 126 | + if (intron.start < other.end && intron.end > other.start) { |
| 127 | + if (intron.full_sequence.size() > other.full_sequence.size()) { |
| 128 | + intron.keep_in_output = false; |
| 129 | + } else if (other.full_sequence.size() > intron.full_sequence.size()) { |
| 130 | + other.keep_in_output = false; |
| 131 | + } |
| 132 | + } |
| 133 | + } |
| 134 | + } |
| 135 | + } |
| 136 | + } |
| 137 | + } |
| 138 | + } |
| 139 | +#endif |
| 140 | + |
| 141 | + for (auto& [gene_id, exons_by_transcript] : exons_by_transcript_by_gene) { |
| 142 | + for (auto& [transcript_id, pair] : exons_by_transcript) { |
| 143 | + auto& [exons, _introns] = pair; |
| 144 | + for (auto& intron : _introns) { |
| 145 | + if (intron.keep_in_output) { |
| 146 | + outputintrons.push_back(intron); |
| 147 | + } |
| 148 | + } |
| 149 | + } |
| 150 | + } |
| 151 | + |
| 152 | + outputintrons.shrink_to_fit(); |
| 153 | + |
| 154 | + std::cout << "-> Loaded " << outputintrons.size() << " introns\n"; |
| 155 | + |
| 156 | +#if 0 |
| 157 | + std::time_t t = std::time(nullptr); |
| 158 | + char mbstr[100]; |
| 159 | + std::strftime(mbstr, 100, "duplicatestats-%Y%m%d%H%M.csv", std::localtime(&t)); |
| 160 | + |
| 161 | + std::ofstream outfile(mbstr); |
| 162 | + if (!outfile) { |
| 163 | + std::cerr << "Error opening " << mbstr << " for writing!\n" |
| 164 | + << "Continuing without duplicate stats output.\n"; |
| 165 | + } else { |
| 166 | + outfile << "\"gene_id\",\"transcript_id\",\"duplicates\"\r\n"; |
| 167 | + for (auto& [gid, ts] : dupecounts) { |
| 168 | + for (auto& [tid, dupes] : ts) { |
| 169 | + outfile << '"' << gid << "\",\"" << tid << "\"," << dupes << "\r\n"; |
| 170 | + } |
| 171 | + } |
| 172 | + outfile.close(); |
| 173 | + std::cout << "-> Wrote duplicate stats to " << mbstr << '\n'; |
| 174 | + } |
| 175 | +#endif |
| 176 | + |
| 177 | + return 0; |
| 178 | +} |
| 179 | + |
| 180 | +int main(int argc, char** argv) { |
| 181 | + if (argc < 4) { |
| 182 | + std::cerr << |
| 183 | +R"usage( |
| 184 | +usage: getcoords <gtf> <out+> <out-> |
| 185 | + gtf: the GTF file to read from |
| 186 | + out+: the file to write positive strand introns to |
| 187 | + out-: the file to write negative strand introns to |
| 188 | +)usage"; |
| 189 | + return 1; |
| 190 | + } |
| 191 | + std::string fname = argv[1]; |
| 192 | + std::string posfile = argv[2]; |
| 193 | + std::string negfile = argv[3]; |
| 194 | + |
| 195 | + std::vector<introns::Intron> introns; |
| 196 | + get_introns(fname, introns); |
| 197 | + |
| 198 | + std::ofstream outpos(posfile); |
| 199 | + if (!outpos) { |
| 200 | + std::cerr << "Could not open positive file for writing!\n"; |
| 201 | + return 1; |
| 202 | + } |
| 203 | + |
| 204 | + std::ofstream outneg(negfile); |
| 205 | + if (!outneg) { |
| 206 | + std::cerr << "Could not open negative file for writing!\n"; |
| 207 | + return 1; |
| 208 | + } |
| 209 | + |
| 210 | + for (auto& i : introns) { |
| 211 | + (i.strand == '-' ? outneg : outpos) << i.gene_id << ',' << i.transcript_id << ',' << i.sequence << ':' << i.start << '-' << i.end << '\n'; |
| 212 | + } |
| 213 | + |
| 214 | + return 0; |
| 215 | +} |
0 commit comments