Skip to content

Commit 7864e72

Browse files
authored
Merge pull request #2416 from nextcloud/backport/2385/stable24
[stable24] Add support for different encodings
2 parents a716916 + d17014c commit 7864e72

File tree

10 files changed

+2711
-572
lines changed

10 files changed

+2711
-572
lines changed

composer.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
"christophwurst/nextcloud": "dev-master",
77
"jakub-onderka/php-parallel-lint": "^1.0.0",
88
"nextcloud/coding-standard": "^1.0.0",
9-
"psalm/phar": "^4.3"
9+
"psalm/phar": "^4.3",
10+
"phpunit/phpunit": "^9.5",
11+
"ext-mbstring": "*"
1012
},
1113
"license": "AGPLv3",
1214
"authors": [
@@ -22,6 +24,7 @@
2224
"lint": "find . -name \\*.php -not -path './vendor/*' -print0 | xargs -0 -n1 php -l",
2325
"cs:check": "php-cs-fixer fix --dry-run --diff",
2426
"cs:fix": "php-cs-fixer fix",
25-
"psalm": "psalm.phar"
27+
"psalm": "psalm.phar",
28+
"test:unit": "phpunit -c tests/phpunit.xml"
2629
}
2730
}

composer.lock

Lines changed: 2498 additions & 570 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/Service/ApiService.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,20 @@ class ApiService {
5050
protected $documentService;
5151
protected $logger;
5252
private $imageService;
53+
private $encodingService;
5354

5455
public function __construct(IRequest $request,
5556
SessionService $sessionService,
5657
DocumentService $documentService,
5758
ImageService $imageService,
59+
EncodingService $encodingService,
5860
ILogger $logger) {
5961
$this->request = $request;
6062
$this->sessionService = $sessionService;
6163
$this->documentService = $documentService;
6264
$this->logger = $logger;
6365
$this->imageService = $imageService;
66+
$this->encodingService = $encodingService;
6467
}
6568

6669
public function create($fileId = null, $filePath = null, $token = null, $guestName = null, bool $forceRecreate = false): DataResponse {
@@ -105,6 +108,11 @@ public function create($fileId = null, $filePath = null, $token = null, $guestNa
105108
try {
106109
$baseFile = $this->documentService->getBaseFile($document->getId());
107110
$content = $baseFile->getContent();
111+
112+
$content = $this->encodingService->encodeToUtf8($content);
113+
if (!$content) {
114+
$this->logger->log(ILogger::WARN, 'Failed to encode file to UTF8. File ID: ' . $file->getId());
115+
}
108116
} catch (NotFoundException $e) {
109117
$this->logger->logException($e, ['level' => ILogger::INFO]);
110118
$content = null;

lib/Service/EncodingService.php

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
/**
6+
* @copyright Copyright (c) 2022 Raul Ferreira Fuentes <raul@nextcloud.com>
7+
*
8+
* @author Raul Ferreira Fuentes <raul@nextcloud.com>
9+
*
10+
* @license GNU AGPL version 3 or any later version
11+
*
12+
* This program is free software: you can redistribute it and/or modify
13+
* it under the terms of the GNU Affero General Public License as
14+
* published by the Free Software Foundation, either version 3 of the
15+
* License, or (at your option) any later version.
16+
*
17+
* This program is distributed in the hope that it will be useful,
18+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20+
* GNU Affero General Public License for more details.
21+
*
22+
* You should have received a copy of the GNU Affero General Public License
23+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
24+
*
25+
*/
26+
namespace OCA\Text\Service;
27+
28+
/**
 * Detects the character encoding of a raw byte string and converts it to UTF-8.
 *
 * Detection order:
 *  1. a leading Unicode byte order mark (see UTF_BOMs),
 *  2. the first candidate encoding for which the bytes are valid,
 *  3. mbstring's own strict detection over the same candidates.
 *
 * Candidates are the current mb_detect_order() followed by COMMON_ENCODINGS.
 */
class EncodingService {
	/** Encodings tried, in this order, after the configured mb_detect_order(). */
	public const COMMON_ENCODINGS = ['UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];

	/**
	 * Byte order marks keyed by the encoding they announce.
	 *
	 * The 32-bit marks must stay ahead of the 16-bit ones: the UTF-32LE mark
	 * starts with the same two bytes as the UTF-16LE mark, and detectUtfBom()
	 * returns the first match.
	 */
	public const UTF_BOMs = [
		'UTF-32BE' => "\x00\x00\xfe\xff",
		'UTF-32LE' => "\xff\xfe\x00\x00",
		'UTF-16BE' => "\xfe\xff",
		'UTF-16LE' => "\xff\xfe",
		'UTF-8' => "\xef\xbb\xbf"
	];

	/**
	 * Convert a byte string to UTF-8.
	 *
	 * @param string $string raw bytes in an unknown encoding
	 * @return string|null the UTF-8 string, or null when no encoding could be
	 *                     detected or the conversion itself failed
	 */
	public function encodeToUtf8(string $string): ?string {
		$encoding = $this->detectEncoding($string);
		if ($encoding === null) {
			return null;
		}

		// mb_convert_encoding() may report failure with false; returning that
		// through a ?string return type under strict_types would raise a
		// TypeError, so map it onto the documented null result instead.
		$converted = mb_convert_encoding($string, 'UTF-8', $encoding);

		return is_string($converted) ? $converted : null;
	}

	/**
	 * Determine the encoding of a byte string.
	 *
	 * @param string $string raw bytes in an unknown encoding
	 * @return string|null the detected encoding name, or null if none matched
	 */
	public function detectEncoding(string $string): ?string {
		$bomEncoding = $this->detectUtfBom($string);
		if ($bomEncoding !== null) {
			return $bomEncoding;
		}

		$candidates = $this->getEncodings();

		// The first candidate for which the bytes are valid wins.
		foreach ($candidates as $candidate) {
			if (mb_check_encoding($string, $candidate)) {
				return $candidate;
			}
		}

		// Fall back to mbstring's own strict detection over the same list.
		return mb_detect_encoding($string, $candidates, true) ?: null;
	}

	/**
	 * Look for a Unicode byte order mark at the very start of the string.
	 *
	 * @return string|null the encoding announced by the mark, or null if the
	 *                     string does not start with one of UTF_BOMs
	 */
	private function detectUtfBom(string $string): ?string {
		foreach (self::UTF_BOMs as $encoding => $utfBom) {
			// Byte-wise comparison is intended here: a BOM is a byte sequence.
			if (substr($string, 0, strlen($utfBom)) === $utfBom) {
				return $encoding;
			}
		}

		return null;
	}

	/**
	 * Candidate encodings: the configured detect order first, then the
	 * common encodings, with duplicates removed (first occurrence kept).
	 *
	 * @return string[]
	 */
	private function getEncodings(): array {
		$mbOrder = mb_detect_order() ?: [];

		return array_values(array_unique(array_merge($mbOrder, self::COMMON_ENCODINGS)));
	}
}

tests/data/big5.txt

Lines changed: 24 additions & 0 deletions
Large diffs are not rendered by default.

tests/data/cp936.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/data/iso-8859-15.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de razão e de consciência, devem agir uns para com os outros em espírito de fraternidade.
2+
3+
Alle menneske er fødde til fridom og med same menneskeverd og menneskerettar. Dei har fått fornuft og samvit og skal leve med kvarandre som brør.
4+
5+
Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.
6+

tests/data/iso-8859-5.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Статья 1
2+
Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства.
3+
4+
Статья 2
5+
Каждый человек должен обладать всеми правами и всеми свободами, провозглашенными настоящей Декларацией, без какого бы то ни было различия, как-то в отношении расы, цвета кожи, пола, языка, религии, политических или иных убеждений, национального или социального происхождения, имущественного, сословного или иного положения.
6+
7+
Кроме того, не должно проводиться никакого различия на основе политического, правового или международного статуса страны или территории, к которой человек принадлежит, независимо от того, является ли эта территория независимой, подопечной, несамоуправляющейся или как-либо иначе ограниченной в своем суверенитете.
8+
9+
Статья 3
10+
Каждый человек имеет право на жизнь, на свободу и на личную неприкосновенность.
11+
12+

tests/data/utf-16.txt

1.4 KB
Binary file not shown.
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
<?php
2+
3+
namespace OCA\Text\Service;
4+
5+
use Test\TestCase;
6+
7+
/**
 * Exercises EncodingService against fixture files stored in several encodings.
 *
 * NOTE(review): the class name does not mention EncodingService; it looks
 * copied from another test. Confirm it against the test file name before
 * renaming.
 */
class LabelServiceTest extends TestCase {
	/** @var EncodingService */
	private $encodingService;

	protected function setUp(): void {
		parent::setUp();
		$this->encodingService = new EncodingService();
	}

	/**
	 * Attempt to decode the file using the default decoding order.
	 * For files with encodings not included in the COMMON_ENCODINGS array, encoding to UTF-8 may fail.
	 * We then do the conversion again after setting the mb_detect_order value: all conversions should succeed.
	 *
	 * @dataProvider dataFileEncodings
	 */
	public function testEncoding(string $file, string $encoding): void {
		$content = $this->readFixture($file);
		$utf8String = $this->encodingService->encodeToUtf8($content);

		// If encoding is not part of the default encodings we can expect it to fail.
		// It might still succeed because encoding detection is not precise.
		if ($utf8String || $this->isSupportedEncoding($encoding)) {
			$this->assertNotNull($utf8String);
			$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
		}

		$originalOrder = mb_detect_order();
		try {
			$this->assertNotFalse(mb_detect_order($encoding));

			$utf8String = $this->encodingService->encodeToUtf8($content);
			$this->assertNotNull($utf8String);
			$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
		} finally {
			// mb_detect_order() is global state: restore it even when an
			// assertion above fails, so later tests see the original order.
			mb_detect_order($originalOrder);
		}
	}

	/**
	 * If the encoding is in the list of common encodings we should be able to detect an encoding (it might not be the
	 * correct encoding due to detection inaccuracies). If not, add the encoding to mb_detect_order.
	 *
	 * @dataProvider dataFileEncodings
	 */
	public function testDetection(string $file, string $encoding): void {
		$content = $this->readFixture($file);

		$detectedEncoding = $this->encodingService->detectEncoding($content);
		if ($this->isSupportedEncoding($encoding)) {
			$this->assertNotNull($detectedEncoding);
		}

		$originalOrder = mb_detect_order();
		try {
			$this->assertNotFalse(mb_detect_order($encoding));

			$detectedEncoding = $this->encodingService->detectEncoding($content);
			$this->assertSame($encoding, $detectedEncoding);
		} finally {
			// Restore the global detect order regardless of the test outcome.
			mb_detect_order($originalOrder);
		}
	}

	/**
	 * Fixture path and the encoding the fixture is stored in.
	 * Paths are relative to the working directory the tests are run from.
	 *
	 * @return array<int, array{0: string, 1: string}>
	 */
	public function dataFileEncodings(): array {
		return [
			['./tests/data/iso-8859-15.txt', 'ISO-8859-15'],
			['./tests/data/big5.txt', 'BIG-5'],
			['./tests/data/cp936.txt', 'CP936'],
			['./tests/data/utf-16.txt', 'UTF-16LE'],
			['./tests/data/iso-8859-5.txt', 'ISO-8859-5'],
		];
	}

	/**
	 * Read a fixture and fail loudly if it is missing, instead of silently
	 * running the assertions against an empty string.
	 */
	private function readFixture(string $file): string {
		$content = file_get_contents($file);
		$this->assertNotFalse($content, 'Could not read fixture file: ' . $file);

		return $content;
	}

	/**
	 * Whether EncodingService knows the encoding without changing
	 * mb_detect_order: either as a common encoding or through a UTF BOM.
	 */
	private function isSupportedEncoding(string $encoding): bool {
		return in_array($encoding, EncodingService::COMMON_ENCODINGS, true)
			|| isset(EncodingService::UTF_BOMs[$encoding]);
	}
}

0 commit comments

Comments
 (0)