Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
#80: Basic HTML reader
  • Loading branch information
ivanlanin committed May 30, 2014
commit ec85d7d641dc7f6a2f75f0ae81279b0c1c5d23a7
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ This is the changelog between releases of PHPWord. Releases are listed in revers

## 0.11.0 - Not yet released

This release marked the change of PHPWord license from LGPL 2.1 to LGPL 3. Four new elements were added: TextBox, ListItemRun, Field, and Line. Relative and absolute positioning for images and textboxes were added. Writer classes were refactored into parts, elements, and styles. ODT and RTF features were enhanced. Ability to add elements to PHPWord object via HTML were implemeted. RTF reader were initiated.
This release marked the change of PHPWord license from LGPL 2.1 to LGPL 3. Four new elements were added: TextBox, ListItemRun, Field, and Line. Relative and absolute positioning for images and textboxes were added. Writer classes were refactored into parts, elements, and styles. ODT and RTF features were enhanced. The ability to add elements to a PHPWord object via HTML was implemented. RTF and HTML readers were initiated.

### Features

Expand Down Expand Up @@ -33,6 +33,7 @@ This release marked the change of PHPWord license from LGPL 2.1 to LGPL 3. Four
- RTF Reader: Basic RTF reader - @ivanlanin GH-72 GH-252
- Element: New `Line` element - @basjan GH-253
- Title: Ability to apply numbering in heading - @ivanlanin GH-193
- HTML Reader: Basic HTML reader - @ivanlanin GH-80

### Bugfixes

Expand Down
15 changes: 15 additions & 0 deletions samples/Sample_30_ReadHTML.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?php
include_once 'Sample_Header.php';

// The sample name doubles as the fixture name under resources/ and as the output name
$name = basename(__FILE__, '.php');
$source = realpath(__DIR__ . "/resources/{$name}.html");

// Read contents through the HTML reader
echo date('H:i:s') . " Reading contents from `{$source}`" . EOL;
$phpWord = \PhpOffice\PhpWord\IOFactory::load($source, 'HTML');

// Save file using the writers made available by the sample header
echo write($phpWord, $name, $writers);
if (!CLI) {
    include_once 'Sample_Footer.php';
}
15 changes: 15 additions & 0 deletions samples/resources/Sample_30_ReadHTML.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<html>
<head>
<meta charset="UTF-8" />
<title>PHPWord</title>
</head>
<body>
<h1>Adding element via HTML</h1>
<p>Some well formed HTML snippet needs to be used</p>
<p>With for example <strong>some<sup>1</sup> <em>inline</em> formatting</strong><sub>1</sub></p>
<p>Unordered (bulleted) list:</p>
<ul><li>Item 1</li><li>Item 2</li><ul><li>Item 2.1</li><li>Item 2.2</li></ul></ul>
<p>Ordered (numbered) list:</p>
<ol><li>Item 1</li><li>Item 2</li></ol>
</body>
</html>
2 changes: 1 addition & 1 deletion src/PhpWord/IOFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public static function createWriter(PhpWord $phpWord, $name = 'Word2007')
*/
public static function createReader($name = 'Word2007')
{
if (!in_array($name, array('ReaderInterface', 'Word2007', 'ODText', 'RTF'))) {
if (!in_array($name, array('ReaderInterface', 'Word2007', 'ODText', 'RTF', 'HTML'))) {
throw new Exception("\"{$name}\" is not a valid reader.");
}

Expand Down
50 changes: 50 additions & 0 deletions src/PhpWord/Reader/HTML.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<?php
/**
* This file is part of PHPWord - A pure PHP library for reading and writing
* word processing documents.
*
* PHPWord is free software distributed under the terms of the GNU Lesser
* General Public License version 3 as published by the Free Software Foundation.
*
* For the full copyright and license information, please read the LICENSE
* file that was distributed with this source code. For the full list of
* contributors, visit https://github.com/PHPOffice/PHPWord/contributors.
*
* @link https://github.com/PHPOffice/PHPWord
* @copyright 2010-2014 PHPWord contributors
* @license http://www.gnu.org/licenses/lgpl.txt LGPL version 3
*/

namespace PhpOffice\PhpWord\Reader;

use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Shared\Html as HTMLParser;

/**
* HTML Reader class
*
* @since 0.11.0
*/
class HTML extends AbstractReader implements ReaderInterface
{
    /**
     * Loads PhpWord from file
     *
     * The file contents are handed to the shared HTML parser, which adds the
     * parsed elements to a single new section of a fresh PhpWord document.
     *
     * @param string $docFile Path of the HTML file to read
     * @throws \Exception When the file fails the canRead() check or its contents cannot be read
     * @return \PhpOffice\PhpWord\PhpWord
     */
    public function load($docFile)
    {
        if (!$this->canRead($docFile)) {
            throw new \Exception("Cannot read {$docFile}.");
        }

        // file_get_contents() returns false on failure; report that the same way
        // as an unreadable file instead of passing false on to the parser.
        $html = file_get_contents($docFile);
        if ($html === false) {
            throw new \Exception("Cannot read {$docFile}.");
        }

        $phpWord = new PhpWord();
        $section = $phpWord->addSection();

        // The third argument flags the input as a full HTML document
        HTMLParser::addHtml($section, $html, true);

        return $phpWord;
    }
}
15 changes: 11 additions & 4 deletions src/PhpWord/Shared/Html.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,27 @@ class Html
* Note: $stylesheet parameter is removed to avoid PHPMD error for unused parameter
*
* @param \PhpOffice\PhpWord\Element\AbstractContainer $element Where the parts need to be added
* @param string $html the code to parse
* @param string $html The code to parse
* @param bool $fullHTML If it's a full HTML, no need to add 'body' tag
*/
public static function addHtml($element, $html)
public static function addHtml($element, $html, $fullHTML = false)
{
/*
* @todo parse $stylesheet for default styles. Should result in an array based on id, class and element,
* which could be applied when such an element occurs in the parseNode function.
*/

// Preprocess: remove all line ends, decode HTML entity, and add body tag for HTML fragments
$html = str_replace(array("\n", "\r"), '', $html);
$html = html_entity_decode($html);
if ($fullHTML === false) {
$html = '<body>' . $html . '</body>';
}

// Load DOM
$dom = new \DOMDocument();
$dom->preserveWhiteSpace = true;
$dom->loadXML('<body>' . html_entity_decode($html) . '</body>');

$dom->loadXML($html);
$node = $dom->getElementsByTagName('body');

self::parseNode($node->item(0), $element);
Expand Down
51 changes: 51 additions & 0 deletions tests/PhpWord/Tests/Reader/HTMLTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?php
/**
* This file is part of PHPWord - A pure PHP library for reading and writing
* word processing documents.
*
* PHPWord is free software distributed under the terms of the GNU Lesser
* General Public License version 3 as published by the Free Software Foundation.
*
* For the full copyright and license information, please read the LICENSE
* file that was distributed with this source code. For the full list of
* contributors, visit https://github.com/PHPOffice/PHPWord/contributors.
*
* @link https://github.com/PHPOffice/PHPWord
* @copyright 2010-2014 PHPWord contributors
* @license http://www.gnu.org/licenses/lgpl.txt LGPL version 3
*/

namespace PhpOffice\PhpWord\Tests\Reader;

use PhpOffice\PhpWord\IOFactory;

/**
* Test class for PhpOffice\PhpWord\Reader\HTML
*
* @coversDefaultClass \PhpOffice\PhpWord\Reader\HTML
* @runTestsInSeparateProcesses
*/
class HTMLTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Loading the HTML fixture through IOFactory yields a PhpWord instance
     */
    public function testLoad()
    {
        $document = IOFactory::load(__DIR__ . '/../_files/documents/reader.html', 'HTML');

        $this->assertInstanceOf('PhpOffice\\PhpWord\\PhpWord', $document);
    }

    /**
     * Loading a path that is expected to be unreadable raises an exception
     *
     * @expectedException \Exception
     * @expectedExceptionMessage Cannot read
     */
    public function testLoadException()
    {
        $unreadable = __DIR__ . '/../_files/documents/foo.html';

        IOFactory::load($unreadable, 'HTML');
    }
}
15 changes: 15 additions & 0 deletions tests/PhpWord/Tests/_files/documents/reader.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<html>
<head>
<meta charset="UTF-8" />
<title>PHPWord</title>
</head>
<body>
<h1>Adding element via HTML</h1>
<p>Some well formed HTML snippet needs to be used</p>
<p>With for example <strong>some<sup>1</sup> <em>inline</em> formatting</strong><sub>1</sub></p>
<p>Unordered (bulleted) list:</p>
<ul><li>Item 1</li><li>Item 2</li><ul><li>Item 2.1</li><li>Item 2.2</li></ul></ul>
<p>Ordered (numbered) list:</p>
<ol><li>Item 1</li><li>Item 2</li></ol>
</body>
</html>