@@ -457,6 +457,93 @@ - (NSString *)stringByEscapingQuotes {
457
457
stringByReplacingOccurrencesOfString: @" \" " withString: @" \\\" " ];
458
458
}
459
459
460
// Counts how many leading bytes of |datap| fall inside the per-position ranges
// of one row of table 3-7 ("Well-Formed UTF-8 Byte Sequences") of the Unicode
// 6.2 spec.
//
//   datap      - bytes to examine.
//   datalen    - number of readable bytes at |datap|.
//   bytesInRow - number of byte positions described by the row.
//   min, max   - inclusive lower/upper bounds for each byte position; each
//                array has at least |bytesInRow| elements.
//
// Returns the number of leading bytes that are in range, stopping at the first
// out-of-range byte or when either the row or the input is exhausted. The
// result is therefore never larger than |datalen| or |bytesInRow|, and it is 0
// when the first byte does not match (a true maximal subpart is never less
// than 1, so callers must handle that case themselves).
static int maximal_subpart_of_row(const unsigned char *datap,
                                  int datalen,
                                  int bytesInRow,
                                  const int *min,
                                  const int *max)
{
    int matched = 0;
    while (matched < bytesInRow && matched < datalen) {
        const int v = datap[matched];
        if (v < min[matched] || v > max[matched]) {
            break;
        }
        matched++;
    }
    // Report only what was actually verified: if the input ended before the
    // row did, the unseen trailing bytes must not be counted as valid.
    return matched;
}
476
+
477
// Finds the longest initial run of bytes at |datap| that looks like the start
// of a well-formed UTF-8 sequence. It is used to decide how many bytes of an
// invalid sequence to gobble up and replace with a single <?> replacement mark.
//
//   datap   - bytes to examine.
//   datalen - number of readable bytes at |datap|.
//
// Returns the largest prefix length matched by any row of the well-formed
// sequence table, raised to at least 1 so that a caller always makes progress,
// and then limited to |datalen|.
static int minimal_subpart(const unsigned char *datap, int datalen)
{
    // This comes from table 3-7 in http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf
    // Positions beyond numBytes are never inspected and are filled with -1.
    struct {
        int numBytes;  // Num values in min, max arrays
        int min[4];    // Minimum values for each byte in a utf-8 sequence.
        int max[4];    // Max values.
    } wellFormedSequencesTable[] = {
        { 1, { 0x00,   -1,   -1,   -1 }, { 0x7f,   -1,   -1,   -1 } },
        { 2, { 0xc2, 0x80,   -1,   -1 }, { 0xdf, 0xbf,   -1,   -1 } },
        { 3, { 0xe0, 0xa0, 0x80,   -1 }, { 0xe0, 0xbf, 0xbf,   -1 } },
        { 3, { 0xe1, 0x80, 0x80,   -1 }, { 0xec, 0xbf, 0xbf,   -1 } },
        { 3, { 0xed, 0x80, 0x80,   -1 }, { 0xed, 0x9f, 0xbf,   -1 } },
        { 3, { 0xee, 0x80, 0x80,   -1 }, { 0xef, 0xbf, 0xbf,   -1 } },
        // The fourth byte of an F0-led sequence is 80..BF like any other
        // trailing byte; leaving it at -1 would make the byte unmatchable.
        { 4, { 0xf0, 0x90, 0x80, 0x80 }, { 0xf0, 0xbf, 0xbf, 0xbf } },
        { 4, { 0xf1, 0x80, 0x80, 0x80 }, { 0xf3, 0xbf, 0xbf, 0xbf } },
        { 4, { 0xf4, 0x80, 0x80, 0x80 }, { 0xf4, 0x8f, 0xbf, 0xbf } },
    };
    const int numRows = (int)(sizeof(wellFormedSequencesTable) /
                              sizeof(wellFormedSequencesTable[0]));

    int longest = 0;
    for (int row = 0; row < numRows; row++) {
        const int matched = maximal_subpart_of_row(datap,
                                                   datalen,
                                                   wellFormedSequencesTable[row].numBytes,
                                                   wellFormedSequencesTable[row].min,
                                                   wellFormedSequencesTable[row].max);
        if (matched > longest) {
            longest = matched;
        }
    }

    // Always consume at least one byte so the caller advances...
    if (longest < 1) {
        longest = 1;
    }
    // ...but never claim more bytes than the input actually holds.
    return datalen < longest ? datalen : longest;
}
546
+
460
547
int decode_utf8_char (const unsigned char *datap,
461
548
int datalen,
462
549
int * restrict result)
@@ -508,8 +595,16 @@ int decode_utf8_char(const unsigned char *datap,
508
595
}
509
596
510
597
if (theChar < smallest[utf8Length]) {
511
- // Reject overlong sequences.
512
- return -utf8Length;
598
+ // A too-long sequence was used to encode a value. For example, a 4-byte sequence must encode
599
+ // a value of at least 0x10000 (it is F0 90 80 80). A sequence like F0 8F BF BF is invalid
600
+ // because there is a 3-byte sequence to encode U+FFFF (the sequence is EF BF BF).
601
+ return -minimal_subpart (datap, datalen);
602
+ }
603
+
604
+ // Reject UTF-16 surrogates. They are invalid UTF-8 sequences.
605
+ // Reject characters above U+10FFFF, as they are also invalid UTF-8 sequences.
606
+ if ((theChar >= 0xD800 && theChar <= 0xDFFF ) || theChar > 0x10FFFF ) {
607
+ return -minimal_subpart (datap, datalen);
513
608
}
514
609
515
610
*result = (int )theChar;
0 commit comments