Skip to content

Commit cf4bded

Browse files
jkruke and kikelkik authored
Only skip first n lines in the first chunk and don't take the first line as header (mholt#1045) (mholt#1046)
* Only skip first n lines in the first chunk and don't take the first line as header (mholt#1045)
* use newline from config (or guess if unset)

---------

Co-authored-by: Jonas Krukenberg <[email protected]>
1 parent a116779 commit cf4bded

File tree

3 files changed

+86
-74
lines changed

3 files changed

+86
-74
lines changed

docs/resources/js/papaparse.js

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,16 @@ License: MIT
491491
this.parseChunk = function(chunk, isFakeChunk)
492492
{
493493
// First chunk pre-processing
494+
const skipFirstNLines = parseInt(this._config.skipFirstNLines) || 0;
495+
if (this.isFirstChunk && skipFirstNLines > 0) {
496+
let _newline = this._config.newline;
497+
if (!_newline) {
498+
const quoteChar = this._config.quoteChar || '"';
499+
_newline = this._handle.guessLineEndings(chunk, quoteChar);
500+
}
501+
const splitChunk = chunk.split(_newline);
502+
chunk = [...splitChunk.slice(skipFirstNLines)].join(_newline);
503+
}
494504
if (this.isFirstChunk && isFunction(this._config.beforeFirstChunk))
495505
{
496506
var modifiedChunk = this._config.beforeFirstChunk(chunk);
@@ -503,7 +513,6 @@ License: MIT
503513
// Rejoin the line we likely just split in two by chunking the file
504514
var aggregate = this._partialLine + chunk;
505515
this._partialLine = '';
506-
507516
var results = this._handle.parse(aggregate, this._baseIndex, !this._finished);
508517

509518
if (this._handle.paused() || this._handle.aborted()) {
@@ -1048,7 +1057,7 @@ License: MIT
10481057
{
10491058
var quoteChar = _config.quoteChar || '"';
10501059
if (!_config.newline)
1051-
_config.newline = guessLineEndings(input, quoteChar);
1060+
_config.newline = this.guessLineEndings(input, quoteChar);
10521061

10531062
_delimiterError = false;
10541063
if (!_config.delimiter)
@@ -1119,6 +1128,32 @@ License: MIT
11191128
_input = '';
11201129
};
11211130

1131+
this.guessLineEndings = function(input, quoteChar)
1132+
{
1133+
input = input.substr(0, 1024 * 1024); // max length 1 MB
1134+
// Replace all the text inside quotes
1135+
var re = new RegExp(escapeRegExp(quoteChar) + '([^]*?)' + escapeRegExp(quoteChar), 'gm');
1136+
input = input.replace(re, '');
1137+
1138+
var r = input.split('\r');
1139+
1140+
var n = input.split('\n');
1141+
1142+
var nAppearsFirst = (n.length > 1 && n[0].length < r[0].length);
1143+
1144+
if (r.length === 1 || nAppearsFirst)
1145+
return '\n';
1146+
1147+
var numWithN = 0;
1148+
for (var i = 0; i < r.length; i++)
1149+
{
1150+
if (r[i][0] === '\n')
1151+
numWithN++;
1152+
}
1153+
1154+
return numWithN >= r.length / 2 ? '\r\n' : '\r';
1155+
};
1156+
11221157
function testEmptyLine(s) {
11231158
return _config.skipEmptyLines === 'greedy' ? s.join('').trim() === '' : s.length === 1 && s[0].length === 0;
11241159
}
@@ -1321,32 +1356,6 @@ License: MIT
13211356
};
13221357
}
13231358

1324-
function guessLineEndings(input, quoteChar)
1325-
{
1326-
input = input.substr(0, 1024 * 1024); // max length 1 MB
1327-
// Replace all the text inside quotes
1328-
var re = new RegExp(escapeRegExp(quoteChar) + '([^]*?)' + escapeRegExp(quoteChar), 'gm');
1329-
input = input.replace(re, '');
1330-
1331-
var r = input.split('\r');
1332-
1333-
var n = input.split('\n');
1334-
1335-
var nAppearsFirst = (n.length > 1 && n[0].length < r[0].length);
1336-
1337-
if (r.length === 1 || nAppearsFirst)
1338-
return '\n';
1339-
1340-
var numWithN = 0;
1341-
for (var i = 0; i < r.length; i++)
1342-
{
1343-
if (r[i][0] === '\n')
1344-
numWithN++;
1345-
}
1346-
1347-
return numWithN >= r.length / 2 ? '\r\n' : '\r';
1348-
}
1349-
13501359
function addError(type, code, msg, row)
13511360
{
13521361
_results.errors.push({

papaparse.js

Lines changed: 37 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,16 @@ License: MIT
511511
this.parseChunk = function(chunk, isFakeChunk)
512512
{
513513
// First chunk pre-processing
514+
const skipFirstNLines = parseInt(this._config.skipFirstNLines) || 0;
515+
if (this.isFirstChunk && skipFirstNLines > 0) {
516+
let _newline = this._config.newline;
517+
if (!_newline) {
518+
const quoteChar = this._config.quoteChar || '"';
519+
_newline = this._handle.guessLineEndings(chunk, quoteChar);
520+
}
521+
const splitChunk = chunk.split(_newline);
522+
chunk = [...splitChunk.slice(skipFirstNLines)].join(_newline);
523+
}
514524
if (this.isFirstChunk && isFunction(this._config.beforeFirstChunk))
515525
{
516526
var modifiedChunk = this._config.beforeFirstChunk(chunk);
@@ -522,22 +532,6 @@ License: MIT
522532

523533
// Rejoin the line we likely just split in two by chunking the file
524534
var aggregate = this._partialLine + chunk;
525-
this._pendingSkip = parseInt(this._config.skipFirstNLines) || 0;
526-
this._skipHeader = 0;
527-
if (this._config.header) {
528-
this._skipHeader++;
529-
}
530-
if (this._pendingSkip > 0) {
531-
var splitChunk = aggregate.split('\n');
532-
var currentChunkLength = splitChunk.length;
533-
if (currentChunkLength <= this._pendingSkip) {
534-
aggregate = this._partialLine;
535-
}
536-
else{
537-
aggregate = this._partialLine + [...splitChunk.slice(0, this._skipHeader), ...splitChunk.slice(this._skipHeader + this._pendingSkip)].join('\n');
538-
}
539-
this._pendingSkip -= currentChunkLength;
540-
}
541535
this._partialLine = '';
542536
var results = this._handle.parse(aggregate, this._baseIndex, !this._finished);
543537

@@ -1093,7 +1087,7 @@ License: MIT
10931087
{
10941088
var quoteChar = _config.quoteChar || '"';
10951089
if (!_config.newline)
1096-
_config.newline = guessLineEndings(input, quoteChar);
1090+
_config.newline = this.guessLineEndings(input, quoteChar);
10971091

10981092
_delimiterError = false;
10991093
if (!_config.delimiter)
@@ -1167,6 +1161,32 @@ License: MIT
11671161
_input = '';
11681162
};
11691163

1164+
this.guessLineEndings = function(input, quoteChar)
1165+
{
1166+
input = input.substring(0, 1024 * 1024); // max length 1 MB
1167+
// Replace all the text inside quotes
1168+
var re = new RegExp(escapeRegExp(quoteChar) + '([^]*?)' + escapeRegExp(quoteChar), 'gm');
1169+
input = input.replace(re, '');
1170+
1171+
var r = input.split('\r');
1172+
1173+
var n = input.split('\n');
1174+
1175+
var nAppearsFirst = (n.length > 1 && n[0].length < r[0].length);
1176+
1177+
if (r.length === 1 || nAppearsFirst)
1178+
return '\n';
1179+
1180+
var numWithN = 0;
1181+
for (var i = 0; i < r.length; i++)
1182+
{
1183+
if (r[i][0] === '\n')
1184+
numWithN++;
1185+
}
1186+
1187+
return numWithN >= r.length / 2 ? '\r\n' : '\r';
1188+
};
1189+
11701190
function testEmptyLine(s) {
11711191
return _config.skipEmptyLines === 'greedy' ? s.join('').trim() === '' : s.length === 1 && s[0].length === 0;
11721192
}
@@ -1373,32 +1393,6 @@ License: MIT
13731393
};
13741394
}
13751395

1376-
function guessLineEndings(input, quoteChar)
1377-
{
1378-
input = input.substring(0, 1024 * 1024); // max length 1 MB
1379-
// Replace all the text inside quotes
1380-
var re = new RegExp(escapeRegExp(quoteChar) + '([^]*?)' + escapeRegExp(quoteChar), 'gm');
1381-
input = input.replace(re, '');
1382-
1383-
var r = input.split('\r');
1384-
1385-
var n = input.split('\n');
1386-
1387-
var nAppearsFirst = (n.length > 1 && n[0].length < r[0].length);
1388-
1389-
if (r.length === 1 || nAppearsFirst)
1390-
return '\n';
1391-
1392-
var numWithN = 0;
1393-
for (var i = 0; i < r.length; i++)
1394-
{
1395-
if (r[i][0] === '\n')
1396-
numWithN++;
1397-
}
1398-
1399-
return numWithN >= r.length / 2 ? '\r\n' : '\r';
1400-
}
1401-
14021396
function addError(type, code, msg, row)
14031397
{
14041398
var error = {

tests/test-cases.js

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1585,11 +1585,11 @@ var PARSE_TESTS = [
15851585
}
15861586
},
15871587
{
1588-
description: "Skip First N number of lines , with header and 3 rows",
1589-
input: 'a,b,c,d\n1,2,3,4\n4,5,6,7',
1588+
description: "Skip First N number of lines , with header and 2 rows",
1589+
input: 'to-be-ignored\na,b,c,d\n1,2,3,4',
15901590
config: { header: true, skipFirstNLines: 1 },
15911591
expected: {
1592-
data: [{a: '4', b: '5', c: '6', d: '7'}],
1592+
data: [{a: '1', b: '2', c: '3', d: '4'}],
15931593
errors: []
15941594
}
15951595
},
@@ -1610,6 +1610,15 @@ var PARSE_TESTS = [
16101610
data: [['a','b','c','d'],['1','2','3','4'],['4','5','6','7']],
16111611
errors: []
16121612
}
1613+
},
1614+
{
1615+
description: "Skip first 2 lines , with custom newline character",
1616+
input: 'skip-this\rskip-this\r1,2,3,4',
1617+
config: { header: false, skipFirstNLines: 2, newline: '\r' },
1618+
expected: {
1619+
data: [['1','2','3','4']],
1620+
errors: []
1621+
}
16131622
}
16141623
];
16151624

0 commit comments

Comments
 (0)