Skip to content

Commit e8e4542

Browse files
committed
Add EncodingService test cases
Signed-off-by: Raul <[email protected]>
1 parent d12cae5 commit e8e4542

File tree

10 files changed

+98
-93
lines changed

10 files changed

+98
-93
lines changed

composer.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
"christophwurst/nextcloud": "dev-stable23",
77
"jakub-onderka/php-parallel-lint": "^1.0.0",
88
"nextcloud/coding-standard": "^0.5.0",
9-
"psalm/phar": "^4.3"
9+
"psalm/phar": "^4.3",
10+
"ext-mbstring": "*"
1011
},
1112
"license": "AGPLv3",
1213
"authors": [
@@ -19,7 +20,6 @@
1920
"lint": "find . -name \\*.php -not -path './vendor/*' -print0 | xargs -0 -n1 php -l",
2021
"cs:check": "php-cs-fixer fix --dry-run --diff",
2122
"cs:fix": "php-cs-fixer fix",
22-
"psalm": "psalm.phar",
23-
"test:unit": "phpunit -c tests/phpunit.xml"
23+
"psalm": "psalm.phar"
2424
}
2525
}

composer.lock

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/Service/EncodingService.php

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,15 @@
2626
namespace OCA\Text\Service;
2727

2828
class EncodingService {
29-
public const COMMON_ENCODINGS = ['UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];
30-
31-
public const UTF_BOMs = [
32-
'UTF-32BE' => "\x00\x00\xfe\xff",
33-
'UTF-32LE' => "\xff\xfe\x00\x00",
34-
'UTF-16BE' => "\xfe\xff",
35-
'UTF-16LE' => "\xff\xfe",
36-
'UTF-8' => "\xef\xbb\xbf"
37-
];
29+
public const COMMON_ENCODINGS = [ 'UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];
3830

31+
public const UTF_BOMs = [
32+
'UTF-32BE' => "\x00\x00\xfe\xff",
33+
'UTF-32LE' => "\xff\xfe\x00\x00",
34+
'UTF-16BE' => "\xfe\xff",
35+
'UTF-16LE' => "\xff\xfe",
36+
'UTF-8' => "\xef\xbb\xbf"
37+
];
3938

4039
public function encodeToUtf8(string $string): ?string {
4140
$encoding = $this->detectEncoding($string);
@@ -47,37 +46,36 @@ public function encodeToUtf8(string $string): ?string {
4746
}
4847

4948
public function detectEncoding(string $string): ?string {
50-
$bom_detect = $this->detectUtfBom($string);
51-
if ($bom_detect) {
52-
return $bom_detect;
53-
}
49+
$bomDetect = $this->detectUtfBom($string);
50+
if ($bomDetect) {
51+
return $bomDetect;
52+
}
5453

55-
$encodings = $this->getEncodings();
56-
foreach ($encodings as $encoding) {
54+
foreach ($this->getEncodings() as $encoding) {
5755
if (mb_check_encoding($string, $encoding)) {
5856
return $encoding;
5957
}
6058
}
6159

62-
return null;
60+
return mb_detect_encoding($string, $this->getEncodings(), true) ?: null;
6361
}
6462

65-
public function detectUtfBom(string $string): ?string {
66-
foreach (self::UTF_BOMs as $encoding => $utf_bom) {
67-
$bom = substr($string, 0, strlen($utf_bom));
68-
if ($bom === $utf_bom) {
69-
return $encoding;
70-
}
71-
}
63+
private function detectUtfBom(string $string): ?string {
64+
foreach (self::UTF_BOMs as $encoding => $utfBom) {
65+
$bom = substr($string, 0, strlen($utfBom));
66+
if ($bom === $utfBom) {
67+
return $encoding;
68+
}
69+
}
7270

73-
return null;
74-
}
71+
return null;
72+
}
7573

7674
/**
7775
* @return string[]
7876
*/
7977
private function getEncodings(): array {
80-
$mb_order = mb_detect_order() ?: [];
81-
return array_merge($mb_order, self::COMMON_ENCODINGS);
78+
$mbOrder = mb_detect_order() ?: [];
79+
return array_merge($mbOrder, self::COMMON_ENCODINGS);
8280
}
8381
}

tests/data/big5.txt

Lines changed: 17 additions & 23 deletions
Large diffs are not rendered by default.

tests/data/cp936.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/data/gbk.txt

Lines changed: 0 additions & 22 deletions
This file was deleted.

tests/data/iso-8859-5.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Статья 1
2+
Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства.
3+
4+
Статья 2
5+
Каждый человек должен обладать всеми правами и всеми свободами, провозглашенными настоящей Декларацией, без какого бы то ни было различия, как-то в отношении расы, цвета кожи, пола, языка, религии, политических или иных убеждений, национального или социального происхождения, имущественного, сословного или иного положения.
6+
7+
Кроме того, не должно проводиться никакого различия на основе политического, правового или международного статуса страны или территории, к которой человек принадлежит, независимо от того, является ли эта территория независимой, подопечной, несамоуправляющейся или как-либо иначе ограниченной в своем суверенитете.
8+
9+
Статья 3
10+
Каждый человек имеет право на жизнь, на свободу и на личную неприкосновенность.
11+
12+

tests/data/utf-16.txt

1.4 KB
Binary file not shown.

tests/unit/Service/EncodingServiceTest.php

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,42 +15,62 @@ protected function setUp(): void {
1515
/**
1616
* Attempt to decode the file using the default decoding order.
1717
* For files with encodings not included in the COMMON_ENCODINGS array encoding to UTF-8 will fail.
18+
* We then do the conversion again after setting the mb_detect_order value: all conversions should succeed.
1819
* @dataProvider dataFileEncodings
1920
*/
20-
public function testDefault(string $file, string $encoding) {
21-
$utf8_string = $this->encodingService->encodeToUtf8(file_get_contents($file));
21+
public function testEncoding(string $file, string $encoding) {
22+
$utf8String = $this->encodingService->encodeToUtf8(file_get_contents($file));
2223

2324
// If encoding is not part of the default encodings we can expect it to fail
2425
// It might still succeed because encoding detection is not precise.
25-
if (!$utf8_string && !in_array($encoding, EncodingService::COMMON_ENCODINGS, true)) {
26-
return;
26+
if ($utf8String || $this->isSupportedEncoding($encoding)) {
27+
$this->assertNotNull($utf8String);
28+
$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
2729
}
2830

29-
$this->assertNotNull($utf8_string);
30-
$this->assertNotFalse(mb_detect_encoding($utf8_string, 'UTF-8', true));
31+
$originalOrder = mb_detect_order();
32+
$this->assertNotFalse(mb_detect_order($encoding));
33+
34+
$utf8String = $this->encodingService->encodeToUtf8(file_get_contents($file));
35+
$this->assertNotNull($utf8String);
36+
$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
37+
38+
mb_detect_order($originalOrder);
3139
}
3240

3341
/**
34-
* Includes the encoding of the file in the detection order config value.
35-
* This means that all files should be successfully encoded to UTF-8.
42+
* If the encoding is in the list of common encodings we should be able to detect an encoding (it might not be the
43+
* correct encoding due to detection inaccuracies). If not, add the encoding to mb_detect_order.
3644
* @dataProvider dataFileEncodings
3745
*/
38-
public function testCustomOrder(string $file, string $encoding) {
39-
$original_order = mb_detect_order();
46+
public function testDetection(string $file, string $encoding) {
47+
$detectedEncoding = $this->encodingService->detectEncoding(file_get_contents($file));
48+
if ($this->isSupportedEncoding($encoding)) {
49+
$this->assertNotNull($detectedEncoding);
50+
}
51+
52+
$originalOrder = mb_detect_order();
4053
$this->assertNotFalse(mb_detect_order($encoding));
4154

42-
$utf8_string = $this->encodingService->encodeToUtf8(file_get_contents($file));
43-
$this->assertNotNull($utf8_string);
44-
$this->assertNotFalse(mb_detect_encoding($utf8_string, 'UTF-8', true));
55+
$detectedEncoding = $this->encodingService->detectEncoding(file_get_contents($file));
56+
$this->assertEquals($encoding, $detectedEncoding);
4557

46-
mb_detect_order($original_order);
58+
mb_detect_order($originalOrder);
4759
}
4860

61+
4962
public function dataFileEncodings(): array {
5063
return [
51-
['./tests/data/iso-8859.txt', 'ISO-8859-1'],
64+
['./tests/data/iso-8859-15.txt', 'ISO-8859-15'],
5265
['./tests/data/big5.txt', 'BIG-5'],
53-
['./tests/data/gbk.txt', 'GBK']
66+
['./tests/data/cp936.txt', 'CP936'],
67+
['./tests/data/utf-16.txt', 'UTF-16LE'],
68+
['./tests/data/iso-8859-5.txt', 'ISO-8859-5'],
5469
];
5570
}
71+
72+
private function isSupportedEncoding(string $encoding): bool {
73+
return in_array($encoding, EncodingService::COMMON_ENCODINGS, true)
74+
|| isset(EncodingService::UTF_BOMs[$encoding]);
75+
}
5676
}

0 commit comments

Comments
 (0)