Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PoC: compare-performance of external logs #3817

Merged
merged 8 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 50 additions & 10 deletions extensions/ql-vscode/src/common/jsonl-reader.ts
Original file line number Diff line number Diff line change
@@ -1,26 +1,66 @@
import { readFile } from "fs-extra";
import { statSync } from "fs";
import { createReadStream } from "fs-extra";
import { createInterface } from "readline";
import { extLogger } from "./logging/vscode";

/**
 * Read a file consisting of multiple JSON objects. Each object is separated from the previous one
 * by a double newline sequence (i.e. a blank line). This is basically a more human-readable form
 * of JSONL, since a single object may span several lines.
 *
 * The file is streamed line by line rather than loaded into memory as a whole. Lines are buffered
 * until a blank line is seen, at which point the buffered lines are parsed as one JSON value and
 * passed to `handler`. Runs of consecutive blank lines are treated as a single separator, and a
 * final object that is not followed by a blank line is still delivered.
 *
 * Progress is written to the extension log every 100,000 lines and once more at the end.
 *
 * @param path The path to the file.
 * @param handler Callback to be invoked for each top-level JSON object in order. Each invocation
 *   is awaited before reading continues.
 * @throws Whatever `JSON.parse` throws for a malformed object (after logging the offending lines),
 *   and any error thrown by `handler`.
 */
export async function readJsonlFile<T>(
  path: string,
  handler: (value: T) => Promise<void>,
): Promise<void> {
  let lineCount = 0;
  let parseCount = 0;
  let currentLineSequence: string[] = [];

  function logProgress(): void {
    void extLogger.log(
      `Processed ${lineCount} lines with ${parseCount} parses...`,
    );
  }

  // Parses the buffered lines as a single JSON value and hands it to the handler.
  // Does nothing when the buffer is empty, so stray blank lines are harmless.
  async function flushCurrentObject(): Promise<void> {
    if (currentLineSequence.length === 0) {
      return;
    }
    let value: T;
    try {
      value = JSON.parse(currentLineSequence.join("\n")) as T;
    } catch (e) {
      const reason = e instanceof Error ? e.message : "UNKNOWN REASON";
      void extLogger.log(
        `Error: Failed to parse at line ${lineCount} of ${path} as JSON: ${reason}. Problematic line below:\n${JSON.stringify(currentLineSequence, null, 2)}`,
      );
      throw e;
    }
    currentLineSequence = [];
    parseCount++;
    await handler(value);
  }

  void extLogger.log(
    `Parsing ${path} (${statSync(path).size / 1024 / 1024} MB)...`,
  );
  const fileStream = createReadStream(path, "utf8");
  const rl = createInterface({
    input: fileStream,
    // Always treat "\r\n" as one line break, even if the two characters arrive in separate chunks.
    crlfDelay: Infinity,
  });

  try {
    for await (const line of rl) {
      // Count first so that error messages report the 1-based number of the line being processed.
      lineCount++;
      if (line === "") {
        // A blank line marks the end of the current JSON object.
        await flushCurrentObject();
      } else {
        currentLineSequence.push(line);
      }
      if (lineCount % 100000 === 0) {
        logProgress();
      }
    }
    // In case the file does not end with a blank line, handle the last JSON object.
    await flushCurrentObject();
  } finally {
    // Release the reader and the file handle even when parsing or the handler fails.
    rl.close();
    fileStream.destroy();
  }
  logProgress();
}
Loading
Loading