Skip to content

Commit f47b1f8

Browse files
committed
Fix how replacement marks are printed in bogus utf-8 sequences. Fixes bug 2579
1 parent a9b3eca commit f47b1f8

File tree

3 files changed

+103
-14
lines changed

3 files changed

+103
-14
lines changed

NSStringITerm.m

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,93 @@ - (NSString *)stringByEscapingQuotes {
457457
stringByReplacingOccurrencesOfString:@"\"" withString:@"\\\""];
458458
}
459459

460+
// Returns how many leading bytes of |datap| fall inside the per-byte ranges of a single row of
// table 3-7 ("Well-Formed UTF-8 Byte Sequences") of the Unicode 6.2 spec.
//
//   datap       Bytes to examine.
//   datalen     Number of readable bytes at |datap|.
//   bytesInRow  Number of usable entries in |min| and |max|.
//   min, max    Inclusive lower and upper bound for each byte position of the row.
//
// The result is the index of the first out-of-range byte, or the number of bytes examined if
// every examined byte is in range. Only bytes that were actually read are counted, so the
// result never exceeds |datalen| (nor |bytesInRow|) and is never negative.
// Returns 0 if no bytes are valid (a true maximal subpart is never less than 1).
static int maximal_subpart_of_row(const unsigned char *datap,
                                  int datalen,
                                  int bytesInRow,
                                  const int *min,  // array of min values, with |bytesInRow| elements.
                                  const int *max)  // array of max values, with |bytesInRow| elements.
{
    // Never look past the end of the input or past the end of the row.
    const int limit = (datalen < bytesInRow) ? datalen : bytesInRow;
    for (int i = 0; i < limit; i++) {
        const int v = datap[i];
        if (v < min[i] || v > max[i]) {
            return i;
        }
    }
    // All examined bytes matched. A non-positive limit means nothing was examined.
    return (limit > 0) ? limit : 0;
}
476+
477+
// This function finds the longest initial sequence of bytes that look like the start of a valid
// UTF-8 sequence. Despite the function's name, this is what the Unicode spec calls the
// "maximal subpart" of an ill-formed sequence.
// It's used to gobble them up and replace them with a <?> replacement mark in an invalid sequence.
//
//   datap    Bytes to examine.
//   datalen  Number of readable bytes at |datap|.
//
// Returns the number of bytes to consume: at least 1 (so the caller always makes progress) and
// never more than |datalen|.
static int minimal_subpart(const unsigned char *datap, int datalen)
{
    // This comes from table 3-7 in http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf
    // Entries of -1 mark byte positions beyond numBytes; they are never consulted.
    static struct {
        int numBytes;  // Num values in min, max arrays
        int min[4];    // Minimum values for each byte in a utf-8 sequence.
        int max[4];    // Max values.
    } wellFormedSequencesTable[] = {
        {
            // U+0000..U+007F
            1,
            { 0x00, -1, -1, -1 },
            { 0x7f, -1, -1, -1 },
        },
        {
            // U+0080..U+07FF
            2,
            { 0xc2, 0x80, -1, -1 },
            { 0xdf, 0xbf, -1, -1 },
        },
        {
            // U+0800..U+0FFF (second byte restricted to exclude overlong forms)
            3,
            { 0xe0, 0xa0, 0x80, -1 },
            { 0xe0, 0xbf, 0xbf, -1 },
        },
        {
            // U+1000..U+CFFF
            3,
            { 0xe1, 0x80, 0x80, -1 },
            { 0xec, 0xbf, 0xbf, -1 },
        },
        {
            // U+D000..U+D7FF (second byte restricted to exclude surrogates)
            3,
            { 0xed, 0x80, 0x80, -1 },
            { 0xed, 0x9f, 0xbf, -1 },
        },
        {
            // U+E000..U+FFFF
            3,
            { 0xee, 0x80, 0x80, -1 },
            { 0xef, 0xbf, 0xbf, -1 },
        },
        {
            // U+10000..U+3FFFF (second byte restricted to exclude overlong forms).
            // The fourth byte is a normal continuation byte, 80..BF.
            4,
            { 0xf0, 0x90, 0x80, 0x80 },
            { 0xf0, 0xbf, 0xbf, 0xbf },
        },
        {
            // U+40000..U+FFFFF
            4,
            { 0xf1, 0x80, 0x80, 0x80 },
            { 0xf3, 0xbf, 0xbf, 0xbf },
        },
        {
            // U+100000..U+10FFFF (second byte restricted to stay at or below U+10FFFF)
            4,
            { 0xf4, 0x80, 0x80, 0x80 },
            { 0xf4, 0x8f, 0xbf, 0xbf },
        },
    };
    const int numRows = (int)(sizeof(wellFormedSequencesTable) / sizeof(wellFormedSequencesTable[0]));

    // Take the best match over all rows.
    int longest = 0;
    for (int row = 0; row < numRows; row++) {
        const int matched = maximal_subpart_of_row(datap,
                                                   datalen,
                                                   wellFormedSequencesTable[row].numBytes,
                                                   wellFormedSequencesTable[row].min,
                                                   wellFormedSequencesTable[row].max);
        longest = MAX(longest, matched);
    }
    // Consume at least one byte even if nothing matched, but never more than is available.
    return MIN(datalen, MAX(1, longest));
}
546+
460547
int decode_utf8_char(const unsigned char *datap,
461548
int datalen,
462549
int * restrict result)
@@ -508,8 +595,16 @@ int decode_utf8_char(const unsigned char *datap,
508595
}
509596

510597
if (theChar < smallest[utf8Length]) {
511-
// Reject overlong sequences.
512-
return -utf8Length;
598+
// A too-long sequence was used to encode a value. For example, a 4-byte sequence must encode
599+
// a value of at least 0x10000 (it is F0 90 80 80). A sequence like F0 8F BF BF is invalid
600+
// because there is a 3-byte sequence to encode U+FFFF (the sequence is EF BF BF).
601+
return -minimal_subpart(datap, datalen);
602+
}
603+
604+
// Reject UTF-16 surrogates. They are invalid UTF-8 sequences.
605+
// Reject characters above U+10FFFF, as they are also invalid UTF-8 sequences.
606+
if ((theChar >= 0xD800 && theChar <= 0xDFFF) || theChar > 0x10FFFF) {
607+
return -minimal_subpart(datap, datalen);
513608
}
514609

515610
*result = (int)theChar;

VT100Terminal.m

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,8 +1459,8 @@ static VT100TCC decode_xterm(unsigned char *datap,
14591459
BOOL unrecognized = NO;
14601460
if (datalen > 0) {
14611461
if (*datap != ';' && *datap != 'P') {
1462-
// Bogus first char after "esc ] [number]". Consume up to and
1463-
// including terminator and then return VT100_NOTSUPPORT.
1462+
// Bogus first char after "esc ] [number]". Consume up to and
1463+
// including terminator and then return VT100_NOTSUPPORT.
14641464
unrecognized = YES;
14651465
} else {
14661466
if (*datap == 'P') {
@@ -1914,14 +1914,6 @@ static VT100TCC decode_utf8(unsigned char *datap,
19141914
if (theChar < 0x80) {
19151915
break;
19161916
}
1917-
// Reject UTF-16 surrogates. They are invalid Unicode codepoints,
1918-
// and NSString initWithBytes fails on them.
1919-
// Reject characters above U+10FFFF. NSString uses UTF-16
1920-
// internally, so it cannot handle higher codepoints.
1921-
if ((theChar >= 0xD800 && theChar <= 0xDFFF) || theChar > 0x10FFFF) {
1922-
utf8DecodeResult = -utf8DecodeResult;
1923-
break;
1924-
}
19251917
p += utf8DecodeResult;
19261918
len -= utf8DecodeResult;
19271919
}
@@ -2583,7 +2575,7 @@ - (void)putStreamData:(NSData*)data
25832575
assert(current_stream_length >= 0);
25842576
if (current_stream_length == 0) {
25852577
streamOffset = 0;
2586-
}
2578+
}
25872579
}
25882580

25892581
- (NSData *)streamData

tests/bogusutf8.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
�12345
1+
Bogus single-byte UTF-8 should print ?12345: �12345
2+
Too-long sequence should print ?????: �����
3+
UTF-8 coding of surrogate value should print ???: ���

0 commit comments

Comments
 (0)