From 39f7058e5a67d01b2fe8e36445bccef52328a3ab Mon Sep 17 00:00:00 2001 From: Arnout Boks Date: Sun, 27 Oct 2013 16:20:57 +0100 Subject: [PATCH] Decoupled tokenizer from detection strategy This opens up possibilities to swap the default token_get_all()-based tokenizer for another one, or to add tokenizers for other programming languages. --- build/package.xml | 9 ++- src/CLI/Command.php | 8 ++- src/Detector/Strategy/Default.php | 26 ++++++-- src/Detector/Tokenizer.php | 68 ++++++++++++++++++++ src/Detector/Tokenizer/PHP.php | 74 ++++++++++++++++++++++ src/Detector/Tokenizer/Result.php | 101 ++++++++++++++++++++++++++++++ src/autoload.php | 3 + tests/DetectorTest.php | 22 ++++--- 8 files changed, 295 insertions(+), 16 deletions(-) create mode 100644 src/Detector/Tokenizer.php create mode 100644 src/Detector/Tokenizer/PHP.php create mode 100644 src/Detector/Tokenizer/Result.php diff --git a/build/package.xml b/build/package.xml index 8625b03c..00e696fb 100644 --- a/build/package.xml +++ b/build/package.xml @@ -38,10 +38,15 @@ - - + + + + + + + diff --git a/src/CLI/Command.php b/src/CLI/Command.php index a0101a8a..417860cb 100644 --- a/src/CLI/Command.php +++ b/src/CLI/Command.php @@ -45,6 +45,7 @@ use SebastianBergmann\PHPCPD\Detector\Detector; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Tokenizer\PHP; use SebastianBergmann\PHPCPD\Log\PMD; use SebastianBergmann\PHPCPD\Log\Text; use SebastianBergmann\FinderFacade\FinderFacade; @@ -162,9 +163,10 @@ protected function execute(InputInterface $input, OutputInterface $output) $progressHelper->start($output, count($files)); } - $strategy = new DefaultStrategy; - $detector = new Detector($strategy, $progressHelper); - $quiet = $output->getVerbosity() == OutputInterface::VERBOSITY_QUIET; + $tokenizer = new PHP; + $strategy = new DefaultStrategy($tokenizer); + $detector = new Detector($strategy, $progressHelper); + $quiet = $output->getVerbosity() == 
OutputInterface::VERBOSITY_QUIET; $clones = $detector->copyPasteDetection( $files, diff --git a/src/Detector/Strategy/Default.php b/src/Detector/Strategy/Default.php index ad99c391..c94a1863 100644 --- a/src/Detector/Strategy/Default.php +++ b/src/Detector/Strategy/Default.php @@ -46,6 +46,7 @@ use SebastianBergmann\PHPCPD\CodeClone; use SebastianBergmann\PHPCPD\CodeCloneFile; use SebastianBergmann\PHPCPD\CodeCloneMap; +use SebastianBergmann\PHPCPD\Detector\Tokenizer; /** * Default strategy for detecting code clones. @@ -59,6 +60,22 @@ */ class DefaultStrategy extends AbstractStrategy { + /** + * @var Tokenizer + */ + private $tokenizer; + + /** + * Constructor. + * + * @param Tokenizer $tokenizer + * @since Method available since Release 2.0.0 + */ + public function __construct(Tokenizer $tokenizer) + { + $this->tokenizer = $tokenizer; + } + /** * Copy & Paste Detection (CPD). * @@ -71,19 +88,20 @@ class DefaultStrategy extends AbstractStrategy */ public function processFile($file, $minLines, $minTokens, CodeCloneMap $result, $fuzzy = false) { - $buffer = file_get_contents($file); + $tokenizerResult = $this->tokenizer->tokenizeFile($file); + $currentTokenPositions = array(); $currentTokenRealPositions = array(); $currentSignature = ''; - $tokens = token_get_all($buffer); + $tokens = $tokenizerResult->getTokens(); $tokenNr = 0; $lastTokenLine = 0; $result->setNumLines( - $result->getNumLines() + substr_count($buffer, "\n") + $result->getNumLines() + $tokenizerResult->getNumberOfLines() ); - unset($buffer); + unset($tokenizerResult); foreach (array_keys($tokens) as $key) { $token = $tokens[$key]; diff --git a/src/Detector/Tokenizer.php b/src/Detector/Tokenizer.php new file mode 100644 index 00000000..60e494ff --- /dev/null +++ b/src/Detector/Tokenizer.php @@ -0,0 +1,68 @@ +. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of Sebastian Bergmann nor the names of his + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * @package phpcpd + * @author Sebastian Bergmann + * @copyright 2009-2013 Sebastian Bergmann + * @license http://www.opensource.org/licenses/BSD-3-Clause The BSD 3-Clause License + * @since File available since Release 2.0.0 + */ + +namespace SebastianBergmann\PHPCPD\Detector; + +use SebastianBergmann\PHPCPD\Detector\Tokenizer\Result; + +/** + * Interface for a tokenizer, which reads a file and splits it + * into lexical tokens. 
+ * + * @author Johann-Peter Hartmann + * @author Sebastian Bergmann + * @copyright 2009-2013 Sebastian Bergmann + * @license http://www.opensource.org/licenses/BSD-3-Clause The BSD 3-Clause License + * @link http://github.com/sebastianbergmann/phpcpd/tree + * @since Class available since Release 2.0.0 + */ +interface Tokenizer +{ + /** + * Tokenizes a file. + * + * @param string $file + * @return Result + */ + public function tokenizeFile($file); +} diff --git a/src/Detector/Tokenizer/PHP.php b/src/Detector/Tokenizer/PHP.php new file mode 100644 index 00000000..605ced09 --- /dev/null +++ b/src/Detector/Tokenizer/PHP.php @@ -0,0 +1,74 @@ +. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of Sebastian Bergmann nor the names of his + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * @package phpcpd + * @author Sebastian Bergmann + * @copyright 2009-2013 Sebastian Bergmann + * @license http://www.opensource.org/licenses/BSD-3-Clause The BSD 3-Clause License + * @since File available since Release 2.0.0 + */ + +namespace SebastianBergmann\PHPCPD\Detector\Tokenizer; + +use SebastianBergmann\PHPCPD\Detector\Tokenizer; + +/** + * A tokenizer for PHP source files. + * + * @author Johann-Peter Hartmann + * @author Sebastian Bergmann + * @copyright 2009-2013 Sebastian Bergmann + * @license http://www.opensource.org/licenses/BSD-3-Clause The BSD 3-Clause License + * @link http://github.com/sebastianbergmann/phpcpd/tree + * @since Class available since Release 2.0.0 + */ +class PHP implements Tokenizer +{ + /** + * Tokenizes a PHP file. + * + * @param string $file + * @return Result + */ + public function tokenizeFile($file) + { + $buffer = file_get_contents($file); + $tokens = token_get_all($buffer); + $numberOfLines = substr_count($buffer, "\n"); + + return new Result($tokens, $numberOfLines); + } +} diff --git a/src/Detector/Tokenizer/Result.php b/src/Detector/Tokenizer/Result.php new file mode 100644 index 00000000..2ca50aa0 --- /dev/null +++ b/src/Detector/Tokenizer/Result.php @@ -0,0 +1,101 @@ +. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of Sebastian Bergmann nor the names of his + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * @package phpcpd + * @author Sebastian Bergmann + * @copyright 2009-2013 Sebastian Bergmann + * @license http://www.opensource.org/licenses/BSD-3-Clause The BSD 3-Clause License + * @since File available since Release 2.0.0 + */ + +namespace SebastianBergmann\PHPCPD\Detector\Tokenizer; + +/** + * Value object that represents the output of tokenizing a file. 
+ * + * @author Johann-Peter Hartmann + * @author Sebastian Bergmann + * @copyright 2009-2013 Sebastian Bergmann + * @license http://www.opensource.org/licenses/BSD-3-Clause The BSD 3-Clause License + * @link http://github.com/sebastianbergmann/phpcpd/tree + * @since Class available since Release 2.0.0 + */ +class Result +{ + /** + * @var array + */ + private $tokens; + + /** + * @var integer + */ + private $numberOfLines; + + /** + * Constructor. + * + * @param array $tokens Array of tokens, in the same format + * as token_get_all() returns them. + * @param integer $numberOfLines The number of lines in the read file. + */ + public function __construct(array $tokens, $numberOfLines) + { + $this->tokens = $tokens; + $this->numberOfLines = $numberOfLines; + } + + /** + * Returns the tokens that were read from the file. + * + * @return array Array of tokens, in the same format + * as token_get_all() returns them. + */ + public function getTokens() + { + return $this->tokens; + } + + /** + * Returns the number of lines in the file. 
+ * + * @return integer + */ + public function getNumberOfLines() + { + return $this->numberOfLines; + } +} diff --git a/src/autoload.php b/src/autoload.php index c951e6e0..b9d8d4d6 100644 --- a/src/autoload.php +++ b/src/autoload.php @@ -59,6 +59,9 @@ function ($class) { 'sebastianbergmann\\phpcpd\\detector\\detector' => '/Detector/Detector.php', 'sebastianbergmann\\phpcpd\\detector\\strategy\\abstractstrategy' => '/Detector/Strategy/Abstract.php', 'sebastianbergmann\\phpcpd\\detector\\strategy\\defaultstrategy' => '/Detector/Strategy/Default.php', + 'sebastianbergmann\\phpcpd\\detector\\tokenizer' => '/Detector/Tokenizer.php', + 'sebastianbergmann\\phpcpd\\detector\\tokenizer\\php' => '/Detector/Tokenizer/PHP.php', + 'sebastianbergmann\\phpcpd\\detector\\tokenizer\\result' => '/Detector/Tokenizer/Result.php', 'sebastianbergmann\\phpcpd\\log\\abstractxmllogger' => '/Log/AbstractXmlLogger.php', 'sebastianbergmann\\phpcpd\\log\\pmd' => '/Log/PMD.php', 'sebastianbergmann\\phpcpd\\log\\text' => '/Log/Text.php' diff --git a/tests/DetectorTest.php b/tests/DetectorTest.php index 49f62d5a..1b525a2a 100644 --- a/tests/DetectorTest.php +++ b/tests/DetectorTest.php @@ -66,7 +66,7 @@ class PHPCPD_DetectorTest extends PHPUnit_Framework_TestCase */ public function testDetectingSimpleClonesWorks($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection( array(TEST_FILES_PATH . 'Math.php') @@ -157,7 +157,7 @@ public function testDetectingSimpleClonesWorks($strategy) */ public function testDetectingExactDuplicateFilesWorks($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection(array( TEST_FILES_PATH . 
'a.php', @@ -186,7 +186,7 @@ public function testDetectingExactDuplicateFilesWorks($strategy) */ public function testDetectingClonesInMoreThanTwoFiles($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection( array( @@ -223,7 +223,7 @@ public function testDetectingClonesInMoreThanTwoFiles($strategy) */ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection( array( @@ -243,7 +243,7 @@ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens($strategy) */ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection( array( @@ -263,7 +263,7 @@ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines($strategy) */ public function testFuzzyClonesAreFound($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection( array( @@ -286,7 +286,7 @@ public function testFuzzyClonesAreFound($strategy) */ public function testStripComments($strategy) { - $detector = new SebastianBergmann\PHPCPD\Detector\Detector(new $strategy); + $detector = $this->createDetector($strategy); $clones = $detector->copyPasteDetection( array( TEST_FILES_PATH . 
'e.php', @@ -312,6 +312,14 @@ public function testStripComments($strategy) $this->assertCount(1, $clones); } + private function createDetector($strategyClass) + { + $tokenizer = new SebastianBergmann\PHPCPD\Detector\Tokenizer\PHP(); + $strategy = new $strategyClass($tokenizer); + + return new SebastianBergmann\PHPCPD\Detector\Detector($strategy); + } + public function strategyProvider() { return array(