Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
"christophwurst/nextcloud": "dev-master",
"jakub-onderka/php-parallel-lint": "^1.0.0",
"nextcloud/coding-standard": "^1.0.0",
"psalm/phar": "^4.3"
"psalm/phar": "^4.3",
"phpunit/phpunit": "^9.5",
"ext-mbstring": "*"
},
"license": "AGPLv3",
"authors": [
Expand All @@ -22,6 +24,7 @@
"lint": "find . -name \\*.php -not -path './vendor/*' -print0 | xargs -0 -n1 php -l",
"cs:check": "php-cs-fixer fix --dry-run --diff",
"cs:fix": "php-cs-fixer fix",
"psalm": "psalm.phar"
"psalm": "psalm.phar",
"test:unit": "phpunit -c tests/phpunit.xml"
}
}
3,068 changes: 2,498 additions & 570 deletions composer.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions lib/Service/ApiService.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,20 @@ class ApiService {
protected $documentService;
protected $logger;
private $imageService;
private $encodingService;

/**
 * Stores the injected collaborators on the instance.
 *
 * @param IRequest $request
 * @param SessionService $sessionService
 * @param DocumentService $documentService
 * @param ImageService $imageService
 * @param EncodingService $encodingService
 * @param ILogger $logger
 */
public function __construct(
	IRequest $request,
	SessionService $sessionService,
	DocumentService $documentService,
	ImageService $imageService,
	EncodingService $encodingService,
	ILogger $logger
) {
	// Assignments follow the parameter order; they are independent of each other.
	$this->request = $request;
	$this->sessionService = $sessionService;
	$this->documentService = $documentService;
	$this->imageService = $imageService;
	$this->encodingService = $encodingService;
	$this->logger = $logger;
}

public function create($fileId = null, $filePath = null, $token = null, $guestName = null, bool $forceRecreate = false): DataResponse {
Expand Down Expand Up @@ -105,6 +108,11 @@ public function create($fileId = null, $filePath = null, $token = null, $guestNa
try {
$baseFile = $this->documentService->getBaseFile($document->getId());
$content = $baseFile->getContent();

$content = $this->encodingService->encodeToUtf8($content);
if (!$content) {
$this->logger->log(ILogger::WARN, 'Failed to encode file to UTF8. File ID: ' . $file->getId());
}
} catch (NotFoundException $e) {
$this->logger->logException($e, ['level' => ILogger::INFO]);
$content = null;
Expand Down
81 changes: 81 additions & 0 deletions lib/Service/EncodingService.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2022 Raul Ferreira Fuentes <[email protected]>
*
* @author Raul Ferreira Fuentes <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
namespace OCA\Text\Service;

/**
 * Detects the character encoding of a byte string and converts it to UTF-8.
 *
 * Detection happens in three steps: a leading Unicode byte-order mark, then the
 * first candidate encoding (mb_detect_order() followed by COMMON_ENCODINGS) in
 * which the input is valid, and finally mb_detect_encoding() in strict mode.
 */
class EncodingService {
	/** Encodings tried, in this order, after the ones returned by mb_detect_order(). */
	public const COMMON_ENCODINGS = [ 'UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];

	/**
	 * Byte-order marks keyed by the encoding they announce.
	 *
	 * The UTF-32 marks are listed first because the UTF-32LE mark begins with
	 * the same two bytes as the UTF-16LE mark.
	 */
	public const UTF_BOMs = [
		'UTF-32BE' => "\x00\x00\xfe\xff",
		'UTF-32LE' => "\xff\xfe\x00\x00",
		'UTF-16BE' => "\xfe\xff",
		'UTF-16LE' => "\xff\xfe",
		'UTF-8' => "\xef\xbb\xbf"
	];

	/**
	 * Convert $string to UTF-8 using the encoding found by detectEncoding().
	 *
	 * @param string $string Raw bytes in an unknown encoding
	 * @return string|null The UTF-8 string, or null when no encoding could be
	 *                     detected or the conversion itself failed
	 */
	public function encodeToUtf8(string $string): ?string {
		$encoding = $this->detectEncoding($string);
		if ($encoding === null || $encoding === '') {
			return null;
		}

		$converted = mb_convert_encoding($string, 'UTF-8', $encoding);

		// mb_convert_encoding() is documented to return false on failure. With
		// strict_types enabled, returning that from a ?string method would raise
		// a TypeError, so map it to the null that callers already handle.
		return is_string($converted) ? $converted : null;
	}

	/**
	 * Determine the encoding of $string.
	 *
	 * @param string $string Raw bytes in an unknown encoding
	 * @return string|null Name of the detected encoding, or null if none matched
	 */
	public function detectEncoding(string $string): ?string {
		$bomEncoding = $this->detectUtfBom($string);
		if ($bomEncoding !== null) {
			return $bomEncoding;
		}

		$candidates = $this->getEncodings();
		foreach ($candidates as $candidate) {
			if (mb_check_encoding($string, $candidate)) {
				return $candidate;
			}
		}

		// Last resort: let mbstring pick among the same candidates in strict mode.
		$detected = mb_detect_encoding($string, $candidates, true);
		return (is_string($detected) && $detected !== '') ? $detected : null;
	}

	/**
	 * Return the encoding announced by a byte-order mark at the start of $string.
	 *
	 * @param string $string Raw bytes
	 * @return string|null Key of the matching UTF_BOMs entry, or null if none matches
	 */
	private function detectUtfBom(string $string): ?string {
		foreach (self::UTF_BOMs as $encoding => $utfBom) {
			// Binary-safe prefix comparison; an input shorter than the mark never matches.
			if (strncmp($string, $utfBom, strlen($utfBom)) === 0) {
				return $encoding;
			}
		}

		return null;
	}

	/**
	 * Candidate encodings: the current mb_detect_order() first, then
	 * COMMON_ENCODINGS, with repeated names removed so that no encoding is
	 * validated twice. array_unique() keeps the first occurrence, so the
	 * relative order of the candidates is unchanged.
	 *
	 * @return string[]
	 */
	private function getEncodings(): array {
		$mbOrder = mb_detect_order();
		if (!is_array($mbOrder)) {
			$mbOrder = [];
		}

		return array_values(array_unique(array_merge($mbOrder, self::COMMON_ENCODINGS)));
	}
}
24 changes: 24 additions & 0 deletions tests/data/big5.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/cp936.txt

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions tests/data/iso-8859-15.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de razão e de consciência, devem agir uns para com os outros em espírito de fraternidade.

Alle menneske er fødde til fridom og med same menneskeverd og menneskerettar. Dei har fått fornuft og samvit og skal leve med kvarandre som brør.

Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.

12 changes: 12 additions & 0 deletions tests/data/iso-8859-5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Статья 1
Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства.

Статья 2
Каждый человек должен обладать всеми правами и всеми свободами, провозглашенными настоящей Декларацией, без какого бы то ни было различия, как-то в отношении расы, цвета кожи, пола, языка, религии, политических или иных убеждений, национального или социального происхождения, имущественного, сословного или иного положения.

Кроме того, не должно проводиться никакого различия на основе политического, правового или международного статуса страны или территории, к которой человек принадлежит, независимо от того, является ли эта территория независимой, подопечной, несамоуправляющейся или как-либо иначе ограниченной в своем суверенитете.

Статья 3
Каждый человек имеет право на жизнь, на свободу и на личную неприкосновенность.


Binary file added tests/data/utf-16.txt
Binary file not shown.
76 changes: 76 additions & 0 deletions tests/unit/Service/EncodingServiceTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
<?php

namespace OCA\Text\Service;

use Test\TestCase;

/**
 * Unit tests for EncodingService, run against fixture files stored in legacy
 * encodings under tests/data.
 *
 * The class is named after its file (EncodingServiceTest.php) so that PHPUnit
 * can match the test case to the file it was loaded from.
 */
class EncodingServiceTest extends TestCase {
	/** @var EncodingService */
	private $encodingService;

	/** @var string[]|bool Value of mb_detect_order() before the test ran */
	private $originalDetectOrder;

	protected function setUp(): void {
		parent::setUp();
		$this->encodingService = new EncodingService();
		// The tests change the global mbstring detection order; remember it so
		// tearDown() can restore it even when an assertion fails mid-test.
		$this->originalDetectOrder = mb_detect_order();
	}

	protected function tearDown(): void {
		if (is_array($this->originalDetectOrder)) {
			mb_detect_order($this->originalDetectOrder);
		}
		parent::tearDown();
	}

	/**
	 * Attempt to decode the file using the default decoding order.
	 * For files with encodings not included in the COMMON_ENCODINGS array encoding to UTF-8 may fail.
	 * We then do the conversion again after setting the mb_detect_order value: all conversions should succeed.
	 * @dataProvider dataFileEncodings
	 */
	public function testEncoding(string $file, string $encoding): void {
		$content = $this->readFixture($file);
		$utf8String = $this->encodingService->encodeToUtf8($content);

		// If encoding is not part of the default encodings we can expect it to fail
		// It might still succeed because encoding detection is not precise.
		if ($utf8String !== null || $this->isSupportedEncoding($encoding)) {
			$this->assertNotNull($utf8String);
			$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
		}

		// Put the fixture's real encoding at the front of the detection order.
		$this->assertNotFalse(mb_detect_order($encoding));

		$utf8String = $this->encodingService->encodeToUtf8($content);
		$this->assertNotNull($utf8String);
		$this->assertNotFalse(mb_detect_encoding($utf8String, 'UTF-8', true));
	}

	/**
	 * If the encoding is in the list of common encodings we should be able to detect an encoding (it might not be the
	 * correct encoding due to detection inaccuracies). If not, add the encoding to mb_detect_order.
	 * @dataProvider dataFileEncodings
	 */
	public function testDetection(string $file, string $encoding): void {
		$content = $this->readFixture($file);

		$detectedEncoding = $this->encodingService->detectEncoding($content);
		if ($this->isSupportedEncoding($encoding)) {
			$this->assertNotNull($detectedEncoding);
		}

		// Put the fixture's real encoding at the front of the detection order.
		$this->assertNotFalse(mb_detect_order($encoding));

		$detectedEncoding = $this->encodingService->detectEncoding($content);
		$this->assertSame($encoding, $detectedEncoding);
	}

	/**
	 * Fixture file paths paired with the encoding each file is stored in.
	 * Paths are anchored on this file's directory (tests/unit/Service) so the
	 * suite does not depend on the current working directory.
	 *
	 * @return array<int, array{0: string, 1: string}>
	 */
	public static function dataFileEncodings(): array {
		$dataDir = __DIR__ . '/../../data/';

		return [
			[$dataDir . 'iso-8859-15.txt', 'ISO-8859-15'],
			[$dataDir . 'big5.txt', 'BIG-5'],
			[$dataDir . 'cp936.txt', 'CP936'],
			[$dataDir . 'utf-16.txt', 'UTF-16LE'],
			[$dataDir . 'iso-8859-5.txt', 'ISO-8859-5'],
		];
	}

	/**
	 * Read a fixture file, failing the test with a clear message when it cannot be read.
	 */
	private function readFixture(string $file): string {
		$content = file_get_contents($file);
		$this->assertNotFalse($content, 'Could not read fixture file: ' . $file);

		return $content;
	}

	/**
	 * Whether EncodingService knows $encoding without help from mb_detect_order():
	 * either it is listed in COMMON_ENCODINGS or it has an entry in UTF_BOMs.
	 */
	private function isSupportedEncoding(string $encoding): bool {
		return in_array($encoding, EncodingService::COMMON_ENCODINGS, true)
			|| isset(EncodingService::UTF_BOMs[$encoding]);
	}
}