From 5ee8027aba8852725a1c8e54b93ddb0d7bb08b01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Tue, 21 May 2019 10:55:59 +0200 Subject: [PATCH 1/3] support GSM National Language Shift Tables --- README.md | 12 +++- SMSCounter.php | 178 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 178 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index fe94fd5..1629e0e 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ use Instasent\SMSCounter\SMSCounter; $smsCounter = new SMSCounter(); $smsCounter->count('some-string-to-be-counted'); +$smsCounter->countWithShiftTables('some-string-to-be-counted'); ``` which returns @@ -26,7 +27,7 @@ stdClass Object ) ``` -You can sanitize your text to be a valid GSM 03.38 charset +You can sanitize your text to be a valid strict GSM 03.38 charset ```php use Instasent\SMSCounter\SMSCounter; @@ -35,6 +36,15 @@ $smsCounter = new SMSCounter(); $smsCounter->sanitizeToGSM('dadáó'); //return dadao ``` +#### National Language Shift Tables + +Starting with release 8 of GSM 03.38, some additional charsets are allowed. These are the National Language Shift Tables currently supported: + +- [Turkish](https://en.wikipedia.org/wiki/GSM_03.38#Turkish_language_(Latin_script)) +- [Spanish](https://en.wikipedia.org/wiki/GSM_03.38#Spanish_language_(Latin_script)) +- [Portuguese](https://en.wikipedia.org/wiki/GSM_03.38#Portuguese_language_(Latin_script)) + + ## Installation `sms-counter-php` is available via [composer](http://getcomposer.org) on [packagist](https://packagist.org/packages/instasent/sms-counter-php). 
diff --git a/SMSCounter.php b/SMSCounter.php index 950d6b7..13a12e3 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -105,18 +105,110 @@ public function getGsm7bitExMap() ); } + public function getTurkishGsm7bitMap() + { + return [ + 10, 12, 13, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 97, 98, 99, 100, 101, + 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, + 116, 117, 118, 119, 120, 121, 122, + 123, 124, 125, 126, 163, 164, 165, + 167, 196, 197, 199, 201, 209, 214, + 220, 223, 224, 228, 229, 231, 233, + 241, 242, 246, 249, 252, 286, 287, + 304, 305, 350, 351, 915, 916, 920, + 923, 926, 928, 931, 934, 936, 937, + 8364, + ]; + } + + public function getAddedTurkishGsm7bitExMap() + { + return [12, 91, 92, 93, 94, 123, 124, 125, 126, 286, 287, 304, 305, 350, 351, 8364]; + } + + public function getAddedSpanishGsm7bitExMap() + { + return [12, 91, 92, 93, 94, 123, 124, 125, 126, 193, 205, 211, 218, 225, 231, 237, 243, 250, 8364]; + } + + public function getPortugueseGsm7bitMap() + { + return [ + 10, 12, 13, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 163, 165, 167, 170, 186, 192, + 193, 194, 195, 199, 201, 202, 205, 211, + 212, 213, 218, 220, 224, 225, 226, 227, + 231, 233, 234, 237, 242, 243, 244, 245, + 250, 252, 915, 916, 920, 928, 931, 934, + 936, 937, 8364, 8734, + ]; + } + + public function getAddedPortugueseGsm7bitExMap() + { + 
return [ + 12, 91, 92, 93, 94, 123, 124, 125, + 126, 193, 194, 195, 202, 205, 211, 212, + 213, 218, 225, 226, 227, 231, 234, 237, + 242, 243, 245, 250, 915, 920, 928, 931, + 934, 936, 937, 8364, + ]; + } + /** * Detects the encoding, Counts the characters, message length, remaining characters. * * @return \stdClass Object with params encoding,length, per_message, remaining, messages */ public function count($text) + { + return $this->doCount($text, false); + } + + /** + * Detects the encoding, Counts the characters, message length, remaining characters. + * Supports language shift tables characters. + * + * @return \stdClass Object with params encoding,length, per_message, remaining, messages + */ + public function countWithShiftTables($text) + { + return $this->doCount($text, true); + } + + /** + * @return \stdClass Object with params encoding,length, per_message, remaining, messages + */ + private function doCount($text, $supportShiftTables) { $unicodeArray = $this->utf8ToUnicode($text); // variable to catch if any ex chars while encoding detection. $exChars = []; - $encoding = $this->detectEncoding($unicodeArray, $exChars); + $encoding = $supportShiftTables + ? $this->detectEncodingWithShiftTables($text, $exChars) + : $this->detectEncoding($text, $exChars); + $length = count($unicodeArray); if ($encoding === self::GSM_7BIT_EX) { @@ -173,17 +265,56 @@ public function count($text) public function detectEncoding($text, &$exChars) { if (!is_array($text)) { - $text = self::utf8ToUnicode($text); + $text = $this->utf8ToUnicode($text); } $utf16Chars = array_diff($text, $this->getGsm7bitExMap()); - if (count($utf16Chars)) { return self::UTF16; } $exChars = array_intersect($text, $this->getAddedGsm7bitExMap()); + if (count($exChars)) { + return self::GSM_7BIT_EX; + } + + return self::GSM_7BIT; + } + /** + * Detects the encoding of a particular text. + * Supports language shift tables characters. 
+ * + * @return string (GSM_7BIT|GSM_7BIT_EX|UTF16) + */ + public function detectEncodingWithShiftTables($text, &$exChars) + { + if (!is_array($text)) { + $text = $this->utf8ToUnicode($text); + } + + $gsmCharMap = array_merge( + $this->getGsm7bitExMap(), + $this->getTurkishGsm7bitMap(), + $this->getAddedTurkishGsm7bitExMap(), + $this->getAddedSpanishGsm7bitExMap(), + $this->getPortugueseGsm7bitMap(), + $this->getAddedPortugueseGsm7bitExMap() + ); + + $utf16Chars = array_diff($text, $gsmCharMap); + if (count($utf16Chars)) { + return self::UTF16; + } + + $addedGsmCharMap = array_merge( + $this->getAddedGsm7bitExMap(), + $this->getAddedTurkishGsm7bitExMap(), + $this->getAddedSpanishGsm7bitExMap(), + $this->getAddedPortugueseGsm7bitExMap() + ); + + $exChars = array_intersect($text, $addedGsmCharMap); if (count($exChars)) { return self::GSM_7BIT_EX; } @@ -296,7 +427,7 @@ public function removeNonGsmChars($str) public function replaceNonGsmChars($str, $replacement = null) { $validChars = $this->getGsm7bitExMap(); - $allChars = self::utf8ToUnicode($str); + $allChars = $this->utf8ToUnicode($str); if (strlen($replacement) > 1) { return false; @@ -525,27 +656,50 @@ public function removeAccents($str) * the encoding an multipart limits to apply the truncate. * * @param string $str Message text - * @param int $messages Number of SMS allowed + * @param int $limitSms Number of SMS allowed * * @return string Truncated message */ public function truncate($str, $limitSms) { - $count = $this->count($str); + return $this->doTruncate($str, $limitSms, false); + } + + /** + * Truncated to the limit of chars allowed by number of SMS. It will detect + * the encoding an multipart limits to apply the truncate. + * Supports language shift tables characters. 
+ * + * @param string $str Message text + * @param int $limitSms Number of SMS allowed + * + * @return string Truncated message + */ + public function truncateWithShiftTables($str, $limitSms) + { + return $this->doTruncate($str, $limitSms, true); + } + + /** + * @return string Truncated message + */ + private function doTruncate($str, $limitSms, $supportShiftTables) + { + $count = $supportShiftTables + ? $this->countWithShiftTables($str) + : $this->count($str); if ($count->messages <= $limitSms) { return $str; } - if ($count->encoding == 'UTF16') { + if ($count->encoding === 'UTF16') { $limit = self::UTF16_LEN; if ($limitSms > 2) { $limit = self::UTF16_LEN_MULTIPART; } - } - - if ($count->encoding != 'UTF16') { + } else { $limit = self::GSM_7BIT_LEN; if ($limitSms > 2) { @@ -555,7 +709,9 @@ public function truncate($str, $limitSms) do { $str = mb_substr($str, 0, $limit * $limitSms); - $count = $this->count($str); + $count = $supportShiftTables + ? $this->countWithShiftTables($str) + : $this->count($str); $limit = $limit - 1; } while ($count->messages > $limitSms); From dda8edf8a38c8db805584714203a50f198036979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Thu, 23 May 2019 17:26:11 +0200 Subject: [PATCH 2/3] test coverage --- Tests/SMSCounterTest.php | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index 9376bc0..93695ca 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -24,6 +24,57 @@ public function testGSM() $this->assertEquals($expected, $count); } + public function testGSM_TR() + { + $text = 'a GSM TR ç Text'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->countWithShiftTables($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::GSM_7BIT_EX; + $expected->length = 16; + $expected->per_message = 160; + $expected->remaining = 144; + $expected->messages = 1; + + 
$this->assertEquals($expected, $count); + } + + public function testGSM_ES() + { + $text = 'a GSM ES Ú Text'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->countWithShiftTables($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::GSM_7BIT_EX; + $expected->length = 16; + $expected->per_message = 160; + $expected->remaining = 144; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + + public function testGSM_PT() + { + $text = 'a GSM PT Ã Text'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->countWithShiftTables($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::GSM_7BIT_EX; + $expected->length = 16; + $expected->per_message = 160; + $expected->remaining = 144; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + public function testGSMSymbols() { $text = 'a GSM +Text'; @@ -162,6 +213,17 @@ public function testTruncate1SmsGSM7() $this->assertEquals($expectedTExt, $output); } + public function testTruncate1SmsGSM7ShiftTable() + { + $text = 'ÚLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem.'; + $expectedTExt = 'ÚLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturien'; + + $smsCounter = new SMSCounter(); + $output = $smsCounter->truncateWithShiftTables($text, 1); + + $this->assertEquals($expectedTExt, $output); + } + public function testTruncate2SmsGSM7() { $text = 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient'; @@ -173,6 +235,17 @@ public function testTruncate2SmsGSM7() $this->assertEquals($expectedTExt, $output); } + public function testTruncate2SmsGSM7ShiftTable() + { + $text = 'çLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturie'; + $expectedTExt = 'çLorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magni'; + + $smsCounter = new SMSCounter(); + $output = $smsCounter->truncateWithShiftTables($text, 2); + + $this->assertEquals($expectedTExt, $output); + } + public function testTruncate1SmsUnicode() { $text = 'Snowman shows off! ☃ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa'; From 83f0cfadaaaee67ff1b715eefabd15ba87873207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Mon, 27 May 2019 15:09:59 +0200 Subject: [PATCH 3/3] fix unicode chars over 0xF0 --- SMSCounter.php | 14 ++++++++++---- Tests/SMSCounterTest.php | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/SMSCounter.php b/SMSCounter.php index 13a12e3..3063618 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -342,14 +342,20 @@ public function utf8ToUnicode($str) } if ($thisValue >= 128) { - if (count($values) == 0) { - $lookingFor = ($thisValue < 224) ? 
2 : 3; + if (count($values) === 0) { + $lookingFor = 2; + + if ($thisValue >= 240) { + $lookingFor = 4; + } elseif ($thisValue >= 224) { + $lookingFor = 3; + } } $values[] = $thisValue; - if (count($values) == $lookingFor) { - $number = ($lookingFor == 3) ? + if (count($values) === $lookingFor) { + $number = ($lookingFor === 3) ? (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64) : (($values[0] % 32) * 64) + ($values[1] % 64); diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index 93695ca..242c2d8 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -180,6 +180,23 @@ public function testUnicode() $this->assertEquals($expected, $count); } + public function testUnicodeEmoji() + { + $text = '😎😎'; + + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 2; + $expected->per_message = 70; + $expected->remaining = 68; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + public function testRemoveNonGSMChars() { $text = 'áno-unicode-remaining` ñ';