Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add EncodingService test cases
Signed-off-by: Raul <[email protected]>
  • Loading branch information
Raudius authored and mejo- committed Jun 9, 2022
commit 9f1b1be2ecb9a80ead6efda20263699254ae684a
52 changes: 25 additions & 27 deletions lib/Service/EncodingService.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,15 @@
namespace OCA\Text\Service;

class EncodingService {
/**
 * Encodings probed in addition to the ones configured through mb_detect_order()
 * (they are appended after the configured order in getEncodings()).
 */
public const COMMON_ENCODINGS = ['UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];

/**
 * Byte order marks, keyed by the encoding they announce.
 *
 * Order matters: detectUtfBom() returns the first entry that matches, and the
 * UTF-32LE mark begins with the UTF-16LE mark, so the UTF-32 entries must be
 * listed before the UTF-16 ones.
 */
public const UTF_BOMs = [
	'UTF-32BE' => "\x00\x00\xfe\xff",
	'UTF-32LE' => "\xff\xfe\x00\x00",
	'UTF-16BE' => "\xfe\xff",
	'UTF-16LE' => "\xff\xfe",
	'UTF-8' => "\xef\xbb\xbf",
];

public function encodeToUtf8(string $string): ?string {
$encoding = $this->detectEncoding($string);
Expand All @@ -47,37 +46,36 @@ public function encodeToUtf8(string $string): ?string {
}

/**
 * Determine the character encoding of a string.
 *
 * A leading UTF byte order mark takes precedence; otherwise the candidate
 * encodings from getEncodings() are probed in order.
 *
 * @param string $string Raw bytes to inspect
 * @return string|null Name of the detected encoding, or null if no candidate matches
 */
public function detectEncoding(string $string): ?string {
	$bomDetect = $this->detectUtfBom($string);
	if ($bomDetect !== null) {
		return $bomDetect;
	}

	$encodings = $this->getEncodings();
	foreach ($encodings as $encoding) {
		// First candidate for which the bytes form a valid string wins
		if (mb_check_encoding($string, $encoding)) {
			return $encoding;
		}
	}

	// Last resort: strict detection over the same candidates; a false result becomes null
	return mb_detect_encoding($string, $encodings, true) ?: null;
}

/**
 * Identify the UTF encoding announced by a byte order mark at the start of a string.
 *
 * @param string $string Raw bytes to inspect
 * @return string|null Key of the first matching self::UTF_BOMs entry, or null if the
 *                     string starts with none of the known marks
 */
public function detectUtfBom(string $string): ?string {
	foreach (self::UTF_BOMs as $encoding => $utfBom) {
		// Byte-wise prefix comparison: a BOM is raw bytes, so the non-mb functions are intended here
		$bom = substr($string, 0, strlen($utfBom));
		if ($bom === $utfBom) {
			return $encoding;
		}
	}

	return null;
}

/**
* @return string[]
*/
private function getEncodings(): array {
	// Configured detection order; a falsy result is replaced by an empty list
	// so that array_merge() always receives arrays.
	$mbOrder = mb_detect_order() ?: [];

	// Configured encodings come first, the common ones are appended after them.
	return array_merge($mbOrder, self::COMMON_ENCODINGS);
}
}
40 changes: 17 additions & 23 deletions tests/data/big5.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/cp936.txt

Large diffs are not rendered by default.

22 changes: 0 additions & 22 deletions tests/data/gbk.txt

This file was deleted.

File renamed without changes.
12 changes: 12 additions & 0 deletions tests/data/iso-8859-5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Статья 1
Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства.

Статья 2
Каждый человек должен обладать всеми правами и всеми свободами, провозглашенными настоящей Декларацией, без какого бы то ни было различия, как-то в отношении расы, цвета кожи, пола, языка, религии, политических или иных убеждений, национального или социального происхождения, имущественного, сословного или иного положения.

Кроме того, не должно проводиться никакого различия на основе политического, правового или международного статуса страны или территории, к которой человек принадлежит, независимо от того, является ли эта территория независимой, подопечной, несамоуправляющейся или как-либо иначе ограниченной в своем суверенитете.

Статья 3
Каждый человек имеет право на жизнь, на свободу и на личную неприкосновенность.


Binary file added tests/data/utf-16.txt
Binary file not shown.
52 changes: 36 additions & 16 deletions tests/unit/Service/EncodingServiceTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,62 @@ protected function setUp(): void {
/**
* Attempt to decode the file using the default decoding order.
* For files with encodings not included in the COMMON_ENCODINGS array, encoding to UTF-8 may fail.
* We then do the conversion again after setting the mb_detect_order value: all conversions should succeed.
* @dataProvider dataFileEncodings
*/
public function testEncoding(string $file, string $encoding) {
	$contents = file_get_contents($file);
	$utf8String = $this->encodingService->encodeToUtf8($contents);

	// If encoding is not part of the default encodings we can expect it to fail
	// It might still succeed because encoding detection is not precise.
	if ($utf8String || $this->isSupportedEncoding($encoding)) {
		$this->assertNotNull($utf8String);
		$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
	}

	// Second pass: with the file's encoding in the detection order the conversion must succeed.
	$originalOrder = mb_detect_order();
	try {
		$this->assertNotFalse(mb_detect_order($encoding));

		$utf8String = $this->encodingService->encodeToUtf8($contents);
		$this->assertNotNull($utf8String);
		$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
	} finally {
		// The detection order is a global mbstring setting: restore it even when an
		// assertion above fails, so later tests are not affected.
		mb_detect_order($originalOrder);
	}
}

/**
* If the encoding is in the list of common encodings we should be able to detect an encoding (it might not be the
* correct encoding due to detection inaccuracies).
* The encoding of the file is then included in the detection order config value (mb_detect_order).
* This means that the encoding of every file should be detected successfully.
* @dataProvider dataFileEncodings
*/
public function testDetection(string $file, string $encoding) {
	$contents = file_get_contents($file);

	// With the default order only supported encodings are guaranteed to yield a result.
	$detectedEncoding = $this->encodingService->detectEncoding($contents);
	if ($this->isSupportedEncoding($encoding)) {
		$this->assertNotNull($detectedEncoding);
	}

	// With the file's encoding in the detection order, exactly that encoding must be reported.
	$originalOrder = mb_detect_order();
	try {
		$this->assertNotFalse(mb_detect_order($encoding));

		$detectedEncoding = $this->encodingService->detectEncoding($contents);
		$this->assertEquals($encoding, $detectedEncoding);
	} finally {
		// The detection order is a global mbstring setting: restore it even when an
		// assertion above fails, so later tests are not affected.
		mb_detect_order($originalOrder);
	}
}


public function dataFileEncodings(): array {
	// Each row: [path to a fixture file, encoding that file is stored in]
	return [
		['./tests/data/iso-8859.txt', 'ISO-8859-1'],
		['./tests/data/iso-8859-15.txt', 'ISO-8859-15'],
		['./tests/data/big5.txt', 'BIG-5'],
		['./tests/data/cp936.txt', 'CP936'],
		['./tests/data/utf-16.txt', 'UTF-16LE'],
		['./tests/data/iso-8859-5.txt', 'ISO-8859-5'],
	];
}

/**
 * Whether the given encoding is listed in EncodingService::COMMON_ENCODINGS
 * or has a byte order mark entry in EncodingService::UTF_BOMs.
 */
private function isSupportedEncoding(string $encoding): bool {
	if (in_array($encoding, EncodingService::COMMON_ENCODINGS, true)) {
		return true;
	}

	return isset(EncodingService::UTF_BOMs[$encoding]);
}
}