From 24546d30e77a4b5aa0a6a78ce2e3e25520cc3d82 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 13 Sep 2021 17:14:38 +0200 Subject: [PATCH 01/68] make sure this Mbt version doesn't run with new style UTF8 aware datafiles --- src/RunTagger.cxx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 66ca244..eeedef6 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -1137,6 +1137,13 @@ namespace Tagger { SettingsFilePath ); unknowntreeflag = true; // there is a unknowntreefile file specified break; + case 'D': + if ( strncmp( SetBuffer, "DATA_VERSION", 12 ) == 0 ){ + cerr << "Found a DATA_VERSION setting in '" << fname << "'" << endl + << "This version of Mbt doesn't support that!" << endl; + return false; + } + // fall through default: cerr << "Unknown option in settingsfile, (" << SetBuffer << "), ignored." < Date: Tue, 16 Nov 2021 18:54:59 +0100 Subject: [PATCH 02/68] fixing icu namespace issues --- src/GenerateTagger.cxx | 1 + src/MbtAPI.cxx | 1 + src/RunTagger.cxx | 1 + src/Tagger.cxx | 1 + 4 files changed, 4 insertions(+) diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index 38d3bec..dd03a1e 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -53,6 +53,7 @@ namespace Tagger { using namespace std; + using namespace icu; using namespace Hash; using namespace Timbl; diff --git a/src/MbtAPI.cxx b/src/MbtAPI.cxx index f6ffc13..32416ff 100644 --- a/src/MbtAPI.cxx +++ b/src/MbtAPI.cxx @@ -45,6 +45,7 @@ using std::vector; using namespace Tagger; using namespace TiCC; +using namespace icu; MbtAPI::MbtAPI( const std::string& optstring ){ TiCC::CL_Options opts; diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index e27abc4..d828d9c 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -57,6 +57,7 @@ using namespace nlohmann; namespace Tagger { using namespace std; + using namespace icu; using namespace Hash; using namespace Timbl; using TiCC::operator<<; diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 5132548..8851016 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -52,6 +52,7 @@ using namespace TiCC; using namespace std; +using namespace icu; LogStream default_log( cerr ); LogStream default_cout( cout, "", NoStamp); From 0f73f90ff35882d88c189b8724261aa343e3011b Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 19 Nov 2021 10:56:47 +0100 Subject: [PATCH 03/68] bump library version --- src/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile.am b/src/Makefile.am index ac8bdac..71d0a41 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -22,7 +22,7 @@ mbtg_SOURCES = Mbtg.cxx convert_SOURCES = convert.cxx lib_LTLIBRARIES = libmbt.la -libmbt_la_LDFLAGS= -version-info 1:0:0 +libmbt_la_LDFLAGS= -version-info 2:0:0 libmbt_la_SOURCES = MbtAPI.cxx Pattern.cxx TagLex.cxx Sentence.cxx \ RunTagger.cxx GenerateTagger.cxx Tagger.cxx From 2106fd2939b5f291c9e4574bdd3f4e77d7c54263 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 19 Nov 2021 11:59:00 +0100 Subject: [PATCH 04/68] updating GitHub action --- .github/workflows/mbt.yml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index ce374e9..21ed5e1 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -13,7 +13,7 @@ on: jobs: notification: runs-on: ubuntu-latest - name: Notify start to #gitlama + name: Notify start to gitlama steps: - name: IRC notification uses: Gottox/irc-message-action@v1 @@ -47,13 +47,14 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get install libicu-dev libxml2-dev libbz2-dev; - sudo apt-get install zlib1g-dev libtar-dev + sudo apt-get install libicu-dev libxml2-dev libbz2-dev + sudo apt-get install zlib1g-dev libtar-dev cppcheck else - brew install libxml2; - brew install bzip2; - brew install zlib; + brew install libxml2 + brew install bzip2 + brew install zlib brew install libtar + brew install cppcheck fi - name: install TiccUtils env: @@ -83,15 +84,22 @@ jobs: run: sh bootstrap.sh - name: configure env: - CXX: ${{ matrix.compiler }} + CXX: ${{ matrix.compiler }} run: ./configure + - name: compiler-id + id: compiler + run: | + id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) + echo "::set-output name=id::$id" + - name: Static Code-check + run: cppcheck --enable=all --quiet --error-exitcode=0 . - name: make run: make - name: install run: sudo make install - name: make check env: - CXX: ${{ matrix.compiler }} + CXX: ${{ matrix.compiler }} run: LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib make check continue-on-error: true - name: show log @@ -102,7 +110,7 @@ jobs: with: server: irc.uvt.nl channel: '#gitlama' - nickname: GitHub + nickname: GH-${{ runner.os }}-${{ steps.compiler.outputs.id }} message: |- ${{ github.event.repository.name }} with ${{ matrix.compiler }} build by ${{ github.actor }} on ${{ matrix.os }}: FAILED - name: Notify IRC of succes @@ -111,6 +119,6 @@ jobs: with: server: irc.uvt.nl channel: '#gitlama' - nickname: GitHub + nickname: GH-${{ runner.os }}-${{ steps.compiler.outputs.id }} message: |- ${{ github.event.repository.name }} with ${{ matrix.compiler }} build by ${{ github.actor }} on ${{ matrix.os }}: SUCCESS From 70ddcad6b8fa36a6490b142d1343e39c3c812fe7 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 19 Nov 2021 12:00:27 +0100 Subject: [PATCH 05/68] trigger action on change of this file --- .github/workflows/mbt.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 21ed5e1..ddea987 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -6,6 +6,7 @@ on: paths: - 'src/**' - 'include/**' + - '.github/**' pull_request: branches: [master] From 481e35f05a9290610c5afcef1b70aa5a73d2c6be Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 23 Nov 2021 18:52:34 +0100 Subject: [PATCH 06/68] numb refactoring --- src/Sentence.cxx | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/Sentence.cxx b/src/Sentence.cxx index fabac45..5660763 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -279,9 +279,8 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( addChars ); + Pat[i_feature++] = TheLex.hash( addChars ); } - i_feature++; } } @@ -321,9 +320,8 @@ namespace Tagger { else { // Out of context. #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( DOT ); + Pat[i_feature++] = TheLex.hash( DOT ); } - i_feature++; } } // i @@ -354,23 +352,20 @@ namespace Tagger { i_feature++; break; case 'f': - Pat[i_feature] = wPtr->word_amb_tag; - i_feature++; + Pat[i_feature++] = wPtr->word_amb_tag; break; case 'F': break; case 'a': - Pat[i_feature] = wPtr->word_amb_tag; - i_feature++; + Pat[i_feature++] = wPtr->word_amb_tag; break; } } else { // Out of context. #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( DOT ); + Pat[i_feature++] = TheLex.hash( DOT ); } - i_feature++; } } // i @@ -387,9 +382,8 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( addChars ); + Pat[i_feature++] = TheLex.hash( addChars ); } - i_feature++; } } @@ -405,9 +399,8 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( addChars ); + Pat[i_feature++] = TheLex.hash( addChars ); } - i_feature++; } // Capital (First Letter)? @@ -422,9 +415,8 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( addChars ); + Pat[i_feature++] = TheLex.hash( addChars ); } - i_feature++; } // Numeric (somewhere in word)? @@ -441,7 +433,6 @@ namespace Tagger { { Pat[i_feature] = TheLex.hash( addChars ); } - i_feature++; } // cerr << "next_pat: i_feature = " << i_feature << endl; // for ( int bla = 0; bla < i_feature; bla++ ) From 4aa5ea9ab9ac6bf55572bffc37e9e1d1965a0315 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 23 Nov 2021 19:15:12 +0100 Subject: [PATCH 07/68] some modernizing --- src/GenerateTagger.cxx | 9 ++------- src/Sentence.cxx | 35 +++++++++++++++-------------------- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index dd03a1e..d8b89a3 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -221,21 +221,16 @@ namespace Tagger { int TaggerClass::makedataset( istream& infile, bool do_known ){ int no_words=0; - int nslots=0; ofstream outfile; MatchAction Action; vector TestPat; if ( do_known ){ - nslots = Ktemplate.totalslots() - Ktemplate.skipfocus; outfile.open( K_option_name, ios::trunc | ios::out ); Action = MakeKnown; - TestPat.reserve(Ktemplate.totalslots()); } else { - nslots = Utemplate.totalslots() - Utemplate.skipfocus; outfile.open( U_option_name, ios::trunc | ios::out ); Action = MakeUnknown; - TestPat.reserve(Utemplate.totalslots()); } // loop as long as you get sentences // @@ -276,8 +271,8 @@ namespace Tagger { } } if ( !skip ){ - for ( int f=0; f < nslots; ++f ){ - outfile << indexlex( TestPat[f], TheLex ) << " "; + for ( const auto& pat: TestPat ){ + outfile << indexlex( pat, TheLex ) << " "; } } int thisTagCode = -1; diff --git a/src/Sentence.cxx b/src/Sentence.cxx index 5660763..3f8a389 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -234,6 +234,7 @@ namespace Tagger { bool sentence::nextpat( MatchAction& Action, vector& Pat, UnicodeHash& wordlist, UnicodeHash& TheLex, unsigned int position, int *old_pat ) const { + Pat.clear(); // safety check: // if ( no_words == 0 || position >= no_words ){ @@ -241,7 +242,6 @@ namespace Tagger { } word *current_word = Words[position]; size_t CurWLen = current_word->the_word.length(); - int i_feature=0; const PatTemplate *aTemplate; word* wPtr; unsigned int tok; @@ -279,7 +279,7 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature++] = TheLex.hash( addChars ); + Pat.push_back( TheLex.hash( addChars ) ); } } } @@ -301,26 +301,25 @@ namespace Tagger { switch(aTemplate->word_templatestring[i]) { case 'w': if ( wordlist.num_of_entries() == 0 ){ - Pat[i_feature] = wPtr->the_word_index; + Pat.push_back( wPtr->the_word_index ); } else { tok = wordlist.lookup( wPtr->the_word ); //cerr << "known word Lookup(" << wPtr->the_word << ") gave " << tok << endl; if ( tok ){ - Pat[i_feature] = wPtr->the_word_index; + Pat.push_back( wPtr->the_word_index ); } else { - Pat[i_feature] = classify_hapax( wPtr->the_word, TheLex ); + Pat.push_back( classify_hapax( wPtr->the_word, TheLex ) ); } } - i_feature++; break; } } else { // Out of context. #pragma omp critical (hasher) { - Pat[i_feature++] = TheLex.hash( DOT ); + Pat.push_back( TheLex.hash( DOT ) ); } } } // i @@ -342,29 +341,28 @@ namespace Tagger { switch(aTemplate->templatestring[ii]){ case 'd': if ( old_pat == 0 ){ - Pat[i_feature] = wPtr->word_ass_tag; + Pat.push_back( wPtr->word_ass_tag ); } else { // cerr << "bekijk old pat = " << position+ii-aTemplate->focuspos // << " - " << old_pat[position+ii-aTemplate->focuspos] << endl; - Pat[i_feature] = old_pat[position+ii-aTemplate->focuspos]; + Pat.push_back( old_pat[position+ii-aTemplate->focuspos] ); } - i_feature++; break; case 'f': - Pat[i_feature++] = wPtr->word_amb_tag; + Pat.push_back( wPtr->word_amb_tag ); break; case 'F': break; case 'a': - Pat[i_feature++] = wPtr->word_amb_tag; + Pat.push_back( wPtr->word_amb_tag ); break; } } else { // Out of context. #pragma omp critical (hasher) { - Pat[i_feature++] = TheLex.hash( DOT ); + Pat.push_back( TheLex.hash( DOT ) ); } } } // i @@ -382,7 +380,7 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature++] = TheLex.hash( addChars ); + Pat.push_back( TheLex.hash( addChars ) ); } } } @@ -399,7 +397,7 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature++] = TheLex.hash( addChars ); + Pat.push_back( TheLex.hash( addChars ) ); } } @@ -415,7 +413,7 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature++] = TheLex.hash( addChars ); + Pat.push_back( TheLex.hash( addChars ) ); } } @@ -431,12 +429,9 @@ namespace Tagger { } #pragma omp critical (hasher) { - Pat[i_feature] = TheLex.hash( addChars ); + Pat.push_back( TheLex.hash( addChars ) ); } } - // cerr << "next_pat: i_feature = " << i_feature << endl; - // for ( int bla = 0; bla < i_feature; bla++ ) - // cerr << bla << " - " << Pat[bla] << endl; return true; } From 5993e9247cf4dbc04e92d5ce20ee9d45d29e6824 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 15 Dec 2021 13:08:53 +0100 Subject: [PATCH 08/68] NEWS about upcoming release --- NEWS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/NEWS b/NEWS index bfaf671..b0435fb 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,11 @@ +mbt version 3.7 2020-04-15 +[Ko vd Sloot] +* use NFC vormalized UnicodeString interally, making Mbt robust against UTF8 + encoded files even for more exotic languages the Dutch. +* Uses the newest UTF8 aware Timbl +* bumped library version +* some code refactoring + mbt version 3.6 2020-04-15 [Ko vd Sloot] * replaced uses of Lexicon class by a normal std::map From a980a1ffa7fac5da55e509cd4b11da68c963ffa7 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 15 Dec 2021 13:09:56 +0100 Subject: [PATCH 09/68] wrong date fixed --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index b0435fb..17cbe9c 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -mbt version 3.7 2020-04-15 +mbt version 3.7 2020-12-15 [Ko vd Sloot] * use NFC vormalized UnicodeString interally, making Mbt robust against UTF8 encoded files even for more exotic languages the Dutch. From 73056f294ebe1f7223236a414d7d81f4eda312c5 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 16 Dec 2021 12:34:22 +0100 Subject: [PATCH 10/68] bumped version after release --- codemeta.json | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/codemeta.json b/codemeta.json index e6840a3..17dc312 100644 --- a/codemeta.json +++ b/codemeta.json @@ -6,7 +6,7 @@ "@type": "SoftwareSourceCode", "identifier": "mbt", "name": "mbt", - "version": "3.7", + "version": "3.8", "description": "MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech. ", "license": "https://spdx.org/licenses/GPL-3.0", "url": "https://languagemachines.github.io/mbt", diff --git a/configure.ac b/configure.ac index ef43d16..8ffc68c 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.61) -AC_INIT([mbt],[3.7],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! +AC_INIT([mbt],[3.8],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([.]) From f554e04f938dd324d726b71d4c45154a15c670c0 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 31 Dec 2021 10:45:47 +0100 Subject: [PATCH 11/68] entering 2022 --- include/mbt/Logging.h | 2 +- include/mbt/MbtAPI.h | 2 +- include/mbt/Pattern.h | 2 +- include/mbt/Sentence.h | 2 +- include/mbt/TagLex.h | 2 +- include/mbt/Tagger.h | 2 +- src/GenerateTagger.cxx | 2 +- src/Mbt.cxx | 2 +- src/MbtAPI.cxx | 2 +- src/Mbtg.cxx | 2 +- src/Pattern.cxx | 2 +- src/RunTagger.cxx | 5 +++-- src/Sentence.cxx | 2 +- src/TagLex.cxx | 2 +- src/Tagger.cxx | 2 +- src/convert.cxx | 2 +- src/simpletest.cxx | 2 +- 17 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/mbt/Logging.h b/include/mbt/Logging.h index 06ed448..5b70152 100644 --- a/include/mbt/Logging.h +++ b/include/mbt/Logging.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/MbtAPI.h b/include/mbt/MbtAPI.h index f9eba60..7a6e2cf 100644 --- a/include/mbt/MbtAPI.h +++ b/include/mbt/MbtAPI.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Pattern.h b/include/mbt/Pattern.h index 2bef435..0f49b06 100644 --- a/include/mbt/Pattern.h +++ b/include/mbt/Pattern.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Sentence.h b/include/mbt/Sentence.h index 5cc909b..2073933 100644 --- a/include/mbt/Sentence.h +++ b/include/mbt/Sentence.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/TagLex.h b/include/mbt/TagLex.h index 35642a3..7dacb60 100644 --- a/include/mbt/TagLex.h +++ b/include/mbt/TagLex.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index bbc5cdb..0b9d6d6 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index d8b89a3..bec4fc7 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Mbt.cxx b/src/Mbt.cxx index 26deea7..e4550c8 100644 --- a/src/Mbt.cxx +++ b/src/Mbt.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/MbtAPI.cxx b/src/MbtAPI.cxx index 32416ff..990b510 100644 --- a/src/MbtAPI.cxx +++ b/src/MbtAPI.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Mbtg.cxx b/src/Mbtg.cxx index fefe539..028002e 100644 --- a/src/Mbtg.cxx +++ b/src/Mbtg.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Pattern.cxx b/src/Pattern.cxx index b3d99d8..cc92e49 100644 --- a/src/Pattern.cxx +++ b/src/Pattern.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index d828d9c..e057b0e 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp @@ -1309,7 +1309,8 @@ namespace Tagger { void TaggerClass::manifest( const string& prog ){ // present yourself to the user // - cerr << prog << " " << VERSION << " (c) CLST, ILK and CLiPS 1998 - 2021." << endl + cerr << prog << " " << VERSION << " (c) CLST, ILK and CLiPS 1998 - 2022." + << endl << "Memory Based Tagger " << endl << "CLST - Centre for Language and Speech Technology," << "Radboud University" << endl diff --git a/src/Sentence.cxx b/src/Sentence.cxx index 3f8a389..652be7f 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/TagLex.cxx b/src/TagLex.cxx index 557fd86..ced23d2 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 8851016..c660ef1 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/convert.cxx b/src/convert.cxx index c0671b4..5958a95 100644 --- a/src/convert.cxx +++ b/src/convert.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/simpletest.cxx b/src/simpletest.cxx index 924d63a..afd8fea 100644 --- a/src/simpletest.cxx +++ b/src/simpletest.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2021 + Copyright (c) 1998 - 2022 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp From edde8213ff84738fd6907b5eab11fd91ed6e7cd8 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Fri, 11 Feb 2022 21:27:48 +0100 Subject: [PATCH 12/68] Added a Dockerfile + instructions (not using LaMachine) --- .dockerignore | 13 +++++++++++++ Dockerfile | 27 +++++++++++++++++++++++++++ README.md | 3 +++ 3 files changed, 43 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..5f7baae --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +.git +.cache +.* +_* +*.cache +*.pyc +build +*.egg-info +gource* +*.tar.gz +*.pdf +TODO +*.lock diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ba58e11 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +#TODO: revert channel to 'latest' instead of 'edge' once 3.16 is released +FROM alpine:edge +#VERSION can be "stable" or "development" +ARG VERSION="stable" +LABEL org.opencontainers.image.authors="Maarten van Gompel " +LABEL description="mbt - Memory based Tagger" + +RUN mkdir -p /data +RUN mkdir -p /usr/src/mbt +COPY . /usr/src/mbt + +RUN if [ "$VERSION" = "stable" ]; then \ + rm -Rf /usr/src/mbt &&\ + echo -e "----------------------------------------------------------\nNOTE: Installing latest stable release as provided by Alpine package manager.\nThis version may diverge from the one in the git master tree!\nFor development, build with --build-arg VERSION=development.\n----------------------------------------------------------\n" &&\ + apk update && apk add mbt; \ + else \ + echo -e "----------------------------------------------------------\nNOTE: Building development versions from source.\nThis version may be experimental and contains bugs!\nFor production, build with --build-arg VERSION=stable ----------------------------------------------------------\n" &&\ + apk add build-base autoconf-archive autoconf automake libtool libtar-dev libbz2 bzip2-dev icu-dev libxml2-dev libexttextcat-dev git &&\ + cd /usr/src/ &&\ + git clone https://github.com/LanguageMachines/ticcutils && cd ticcutils && sh ./bootstrap.sh && ./configure && make && make install && cd .. &&\ + git clone https://github.com/LanguageMachines/timbl && cd timbl && sh ./bootstrap.sh && ./configure && make && make install && cd .. &&\ + cd mbt && sh bootstrap.sh && ./configure && make && make install; \ + fi + +WORKDIR / + +ENTRYPOINT [ "mbt" ] diff --git a/README.md b/README.md index 5a9f07e..fbcf357 100644 --- a/README.md +++ b/README.md @@ -54,3 +54,6 @@ To compile and install manually from source instead, provided you have all the d $ ./configure $ make $ make install + +A `Dockerfile` for a container build is also available, specify `--build-arg VERSION=development` if you want the latest +development version rather than the latest stable release as shipped with Alpine Linux. From ae3e0d82d1a15712f8d11616dd8b0201880da8b8 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 21 Jul 2022 20:43:20 +0200 Subject: [PATCH 13/68] codemeta.json: updated metadata according to new (proposed) CLARIAH requirements (CLARIAH/clariah-plus#38) --- codemeta.json | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/codemeta.json b/codemeta.json index 17dc312..8e665b6 100644 --- a/codemeta.json +++ b/codemeta.json @@ -10,6 +10,22 @@ "description": "MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech. ", "license": "https://spdx.org/licenses/GPL-3.0", "url": "https://languagemachines.github.io/mbt", + "producer": { + "@id": "https://huc.knaw.nl", + "@type": "Organization", + "name": "KNAW Humanities Cluster", + "url": "https://huc.knaw.nl", + "parentOrganization": { + "@id": "https://knaw.nl", + "@type": "Organization", + "name": "KNAW", + "url": "https://knaw.nl", + "location": { + "@type": "Place", + "name": "Amsterdam" + } + } + }, "author": [ { "@type": "Person", @@ -45,8 +61,7 @@ "@type": "Person", "givenName": "Antal", "familyName": "van den Bosch", - "email": "antal.vandenbosch@let.ru.nl", - "affiliation": { "@id": "https://cls.ru.nl" } + "email": "antal.vandenbosch@let.ru.nl" }, { "@type": "Person", @@ -59,13 +74,12 @@ "familyName": "Zavrel" } ], - "sourceOrganization": { "@id": "https://www.ru.nl/clst" }, "programmingLanguage": { "@type": "ComputerLanguage", "identifier": "c++", "name": "C++" }, - "operatingSystem": "POSIX", + "operatingSystem": [ "Linux", "BSD", "macOS"], "codeRepository": "https://github.com/LanguageMachines/mbt", "softwareRequirements": [ { @@ -98,5 +112,21 @@ "url": "http://ilk.uvt.nl/mblp", "publisher": "Cambridge University Press" } - ] + ], + "targetProduct": [ + { + "@type": "SoftwareLibrary", + "executableName": "libmbt", + "name": "libmbt", + "runtimePlatform": [ "Linux", "BSD", "macOS" ], + "description": "Memory-based Tagging Library with API for C++" + }, + { + "@type": "CommandLineApplication", + "executableName": "mbt", + "name": "mbt", + "runtimePlatform": [ "Linux", "BSD", "macOS" ], + "description": "Command-line interface to the full NLP suite" + } + ] } From 9952ebe6035973c5c744e410cb62d8898c4dc7be Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 21 Jul 2022 20:47:45 +0200 Subject: [PATCH 14/68] Updated Dockerfile, using build-deps.sh --- Dockerfile | 24 +++++++++++++----------- Makefile.am | 9 +++++++++ README.md | 10 +++++++--- build-deps.sh | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 14 deletions(-) create mode 100755 build-deps.sh diff --git a/Dockerfile b/Dockerfile index ba58e11..203953a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,8 @@ -#TODO: revert channel to 'latest' instead of 'edge' once 3.16 is released -FROM alpine:edge -#VERSION can be "stable" or "development" +FROM alpine:latest +#VERSION can be: +# - stable: builds latest stable versions from source (default) +# - distro: uses packages as provided by Alpine Linux (may be slightly out of date) +# - devel: latest development version (git master/main branch) ARG VERSION="stable" LABEL org.opencontainers.image.authors="Maarten van Gompel " LABEL description="mbt - Memory based Tagger" @@ -9,17 +11,17 @@ RUN mkdir -p /data RUN mkdir -p /usr/src/mbt COPY . /usr/src/mbt -RUN if [ "$VERSION" = "stable" ]; then \ +RUN if [ "$VERSION" = "distro" ]; then \ rm -Rf /usr/src/mbt &&\ - echo -e "----------------------------------------------------------\nNOTE: Installing latest stable release as provided by Alpine package manager.\nThis version may diverge from the one in the git master tree!\nFor development, build with --build-arg VERSION=development.\n----------------------------------------------------------\n" &&\ + echo -e "----------------------------------------------------------\nNOTE: Installing latest release as provided by Alpine package manager.\nThis version may diverge from the one in the git master tree or even from the latest release on github!\nFor development, build with --build-arg VERSION=development.\n----------------------------------------------------------\n" &&\ apk update && apk add mbt; \ else \ - echo -e "----------------------------------------------------------\nNOTE: Building development versions from source.\nThis version may be experimental and contains bugs!\nFor production, build with --build-arg VERSION=stable ----------------------------------------------------------\n" &&\ - apk add build-base autoconf-archive autoconf automake libtool libtar-dev libbz2 bzip2-dev icu-dev libxml2-dev libexttextcat-dev git &&\ - cd /usr/src/ &&\ - git clone https://github.com/LanguageMachines/ticcutils && cd ticcutils && sh ./bootstrap.sh && ./configure && make && make install && cd .. &&\ - git clone https://github.com/LanguageMachines/timbl && cd timbl && sh ./bootstrap.sh && ./configure && make && make install && cd .. &&\ - cd mbt && sh bootstrap.sh && ./configure && make && make install; \ + PACKAGES="libtar libbz2 icu-libs libxml2 libexttextcat libgomp libstdc++" &&\ + BUILD_PACKAGES="build-base autoconf-archive autoconf automake libtool libtar-dev bzip2-dev icu-dev libxml2-dev git" &&\ + apk add $PACKAGES $BUILD_PACKAGES &&\ + cd /usr/src/ && ./mbt/build-deps.sh &&\ + cd mbt && sh ./bootstrap.sh && ./configure && make && make install &&\ + apk del $BUILD_PACKAGES && rm -Rf /usr/src; \ fi WORKDIR / diff --git a/Makefile.am b/Makefile.am index 70681a2..c930bac 100644 --- a/Makefile.am +++ b/Makefile.am @@ -12,3 +12,12 @@ pkgconfig_DATA = mbt.pc ChangeLog: NEWS git pull; git2cl > ChangeLog + +docker: + docker build -t mbt:latest . + +docker-dev: + docker build -t mbt:dev --build-arg VERSION=development . + +deps: + ./build-deps.sh diff --git a/README.md b/README.md index fbcf357..dd8c272 100644 --- a/README.md +++ b/README.md @@ -45,15 +45,19 @@ following pakages: To install Mbt, first consult whether your distribution's package manager has an up-to-date package. -If not, for easy installation of Mbt, TiMBL, and all dependencies, it is included as part of our software -distribution LaMachine: https://proycon.github.io/LaMachine . -To compile and install manually from source instead, provided you have all the dependencies installed: +To compile and install manually from source instead: $ bash bootstrap.sh $ ./configure $ make $ make install +This requires you have all the necessary dependencies. If you want to +automatically download and install the latest stable versions of the required +dependencies, then run `./build-deps.sh` prior to the above. You can pass a +target directory prefix as first argument and you may need to prepend `sudo` to +ensure you can install there. + A `Dockerfile` for a container build is also available, specify `--build-arg VERSION=development` if you want the latest development version rather than the latest stable release as shipped with Alpine Linux. diff --git a/build-deps.sh b/build-deps.sh new file mode 100755 index 0000000..13efa37 --- /dev/null +++ b/build-deps.sh @@ -0,0 +1,38 @@ +#!/bin/sh + +# Builds necessary dependencies from source + +set -e + +[ -z "$VERSION" ] && VERSION=stable +[ -z "$PREFIX" ] && [ -n "$1" ] && PREFIX=$1 +[ -z "$PREFIX" ] && PREFIX=/usr/local + +if [ "$VERSION" = "stable" ]; then + echo "------------------------------------------------------------------------">&2 + echo " Building latest stable release of main dependencies from source.">&2 + echo "------------------------------------------------------------------------">&2 +else + echo "------------------------------------------------------------------------">&2 + echo " Building development versions of main dependencie from source.">&2 + echo " (This is experimental and may contain bugs! DO NOT PUBLISH!)">&2 + echo "-----------------------------------------------------------------------">&2 +fi + +PWD="$(pwd)" +BUILDDIR="$(mktemp -dt "build-deps.XXXXXX")" +cd "$BUILDDIR" +BUILD_SOURCES="LanguageMachines/ticcutils LanguageMachines/timbl" +for SUFFIX in $BUILD_SOURCES; do \ + NAME="$(basename "$SUFFIX")" + git clone "https://github.com/$SUFFIX" + cd "$NAME" + REF=$(git tag -l | grep -E "^v?[0-9]+(\.[0-9])*" | sort -t. -k 1.2,1n -k 2,2n -k 3,3n -k 4,4n | tail -n 1) + if [ "$VERSION" = "stable" ] && [ -n "$REF" ]; then + git -c advice.detachedHead=false checkout "$REF" + fi + sh ./bootstrap.sh && ./configure --prefix "$PREFIX" && make && make install + cd .. +done +cd "$PWD" +[ -n "$BUILDDIR" ] && rm -Rf "$BUILDDIR" From 9ce7f042e731ec1a5ebb943380af19ef83fb3516 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 21 Jul 2022 20:51:55 +0200 Subject: [PATCH 15/68] codemeta.json: update (missing context) --- codemeta.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codemeta.json b/codemeta.json index 8e665b6..28b08c9 100644 --- a/codemeta.json +++ b/codemeta.json @@ -1,7 +1,8 @@ { "@context": [ "https://doi.org/10.5063/schema/codemeta-2.0", - "http://schema.org" + "http://schema.org", + "https://w3id.org/software-types" ], "@type": "SoftwareSourceCode", "identifier": "mbt", From 5a98e34ba18d34c933c1985d131c811ed9b008a7 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 21 Jul 2022 20:53:46 +0200 Subject: [PATCH 16/68] codemeta.json: fixed wrong description --- codemeta.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codemeta.json b/codemeta.json index 28b08c9..87205ee 100644 --- a/codemeta.json +++ b/codemeta.json @@ -127,7 +127,7 @@ "executableName": "mbt", "name": "mbt", "runtimePlatform": [ "Linux", "BSD", "macOS" ], - "description": "Command-line interface to the full NLP suite" + "description": "Memory-based tagger, command-line tool" } ] } From 92f6ea41c706b6a20751f1dae9f263bf6ebb67bf Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Thu, 21 Jul 2022 20:57:24 +0200 Subject: [PATCH 17/68] codemeta.json: fixed developmentStatus --- codemeta.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codemeta.json b/codemeta.json index 87205ee..9b31ef8 100644 --- a/codemeta.json +++ b/codemeta.json @@ -103,7 +103,7 @@ "issueTracker": "https://github.com/LanguageMachines/mbt/issues", "contIntegration": "https://travis-ci.org/LanguageMachines/mbt", "releaseNotes": "https://github.com/LanguageMachines/mbt/releases", - "developmentStatus": "active", + "developmentStatus": "https://www.repostatus.org/#active", "keywords": [ "nlp", "natural language processing", "memory based learning", "tagger", "machine learning" ], "referencePublication": [ { From fe1719ce6cb55368979215926500ed30477d78e9 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Fri, 22 Jul 2022 00:18:43 +0200 Subject: [PATCH 18/68] NEWS: updated in preparation for release --- NEWS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS b/NEWS index 17cbe9c..32cc2a1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +mbt version 3.8 2022-07-22 +[Maarten van Gompel] +* updated metadata (codemeta.json) following new (proposed) CLARIAH requirements (CLARIAH/clariah-plus#38) +* added builds-deps.sh for automatically building and installing dependencies +* added Dockerfile and instructions +* no functional changes + mbt version 3.7 2020-12-15 [Ko vd Sloot] * use NFC vormalized UnicodeString interally, making Mbt robust against UTF8 From 0d5fc662a9143bbcd22da4ff25635e36a21aecbd Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Wed, 24 Aug 2022 22:09:50 +0200 Subject: [PATCH 19/68] MAINTAINERS: added --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 MAINTAINERS diff --git a/MAINTAINERS b/MAINTAINERS new file mode 100644 index 0000000..f464cfc --- /dev/null +++ b/MAINTAINERS @@ -0,0 +1,2 @@ +Maarten van Gompel (KNAW Humanities Cluster) +Ko van der Sloot From b5196ff4986d75bf9e646e0cda4d90ca48a4e05e Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 26 Aug 2022 08:25:13 +0200 Subject: [PATCH 20/68] modernizing --- configure.ac | 5 ++--- src/RunTagger.cxx | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index 8ffc68c..a3f323e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,13 +1,13 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. -AC_PREREQ(2.61) +AC_PREREQ([2.69]) AC_INIT([mbt],[3.8],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([.]) AC_CONFIG_MACRO_DIR([m4]) -AC_CONFIG_HEADER([config.h]) +AC_CONFIG_HEADERS([config.h]) if test x"${CXXFLAGS+set}" = xset; then # the user set CXXFLAGS; don't override it. @@ -22,7 +22,6 @@ fi # Checks for programs. AC_PROG_CXX( [g++] ) -AC_PROG_LIBTOOL LT_INIT # when running tests, use CXX diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index e057b0e..6cdf651 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -97,7 +97,6 @@ namespace Tagger { (paths = new int*[Size]) == 0 || (temppaths = new int*[Size]) == 0 ){ throw runtime_error( "Beam: not enough memory for N-best search tables" ); - return false; } else { for ( int q=0; q < Size; ++q ){ @@ -105,7 +104,6 @@ namespace Tagger { temppaths[q] = 0; if ( (n_best_array[q] = new n_best_tuple) == 0 ){ throw runtime_error( "Beam: not enough memory for N-best search tables" ); - return false; } } } @@ -120,7 +118,6 @@ namespace Tagger { if ( (paths[q] = new int[noWords]) == 0 || (temppaths[q] = new int[noWords]) == 0 ){ throw runtime_error( "Beam: not enough memory for N-best search tables" ); - return false; } } size = Size; From 14795232d3f506775aaeef9606cf20a71e853f70 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 4 Oct 2022 13:24:03 +0200 Subject: [PATCH 21/68] change permissions --- bootstrap.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bootstrap.sh diff --git a/bootstrap.sh b/bootstrap.sh old mode 100644 new mode 100755 From 8e166a08a6ea66bf6435e7f7bef546d6e065a58b Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 2 Nov 2022 11:29:20 +0100 Subject: [PATCH 22/68] rely on ax_pthread.m4 from the autoconf archive --- configure.ac | 4 +- m4/acx_pthread.m4 | 271 ---------------------------------------------- 2 files changed, 2 insertions(+), 273 deletions(-) delete mode 100644 m4/acx_pthread.m4 diff --git a/configure.ac b/configure.ac index a3f323e..ba5d1aa 100644 --- a/configure.ac +++ b/configure.ac @@ -38,8 +38,8 @@ AC_TYPE_SIZE_T # Checks for library functions. # check for pthreads -ACX_PTHREAD([],[AC_MSG_NOTICE([no pthread support found])]) -if test x"$acx_pthread_ok" = xyes; then +AX_PTHREAD([],[AC_MSG_NOTICE([no pthread support found])]) +if test x"$ax_pthread_ok" = xyes; then LIBS="$PTHREAD_LIBS $LIBS" \ CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" fi diff --git a/m4/acx_pthread.m4 b/m4/acx_pthread.m4 deleted file mode 100644 index 44d0085..0000000 --- a/m4/acx_pthread.m4 +++ /dev/null @@ -1,271 +0,0 @@ -# =========================================================================== -# http://autoconf-archive.cryp.to/acx_pthread.html -# =========================================================================== -# -# SYNOPSIS -# -# ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) -# -# DESCRIPTION -# -# This macro figures out how to build C programs using POSIX threads. It -# sets the PTHREAD_LIBS output variable to the threads library and linker -# flags, and the PTHREAD_CFLAGS output variable to any special C compiler -# flags that are needed. (The user can also force certain compiler -# flags/libs to be tested by setting these environment variables.) -# -# Also sets PTHREAD_CC to any special C compiler that is needed for -# multi-threaded programs (defaults to the value of CC otherwise). (This -# is necessary on AIX to use the special cc_r compiler alias.) -# -# NOTE: You are assumed to not only compile your program with these flags, -# but also link it with them as well. e.g. you should link with -# $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS -# -# If you are only building threads programs, you may wish to use these -# variables in your default LIBS, CFLAGS, and CC: -# -# LIBS="$PTHREAD_LIBS $LIBS" -# CFLAGS="$CFLAGS $PTHREAD_CFLAGS" -# CC="$PTHREAD_CC" -# -# In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant -# has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name -# (e.g. PTHREAD_CREATE_UNDETACHED on AIX). -# -# ACTION-IF-FOUND is a list of shell commands to run if a threads library -# is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it -# is not found. If ACTION-IF-FOUND is not specified, the default action -# will define HAVE_PTHREAD. -# -# Please let the authors know if this macro fails on any platform, or if -# you have any other suggestions or comments. This macro was based on work -# by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help -# from M. Frigo), as well as ac_pthread and hb_pthread macros posted by -# Alejandro Forero Cuervo to the autoconf macro repository. We are also -# grateful for the helpful feedback of numerous users. -# -# LICENSE -# -# Copyright (c) 2008 Steven G. Johnson -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see . -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -AC_DEFUN([ACX_PTHREAD], [ -AC_REQUIRE([AC_CANONICAL_HOST]) -AC_LANG_SAVE -AC_LANG([C++]) -acx_pthread_ok=no - -# We used to check for pthread.h first, but this fails if pthread.h -# requires special compiler flags (e.g. on True64 or Sequent). -# It gets checked for in the link test anyway. - -# First of all, check if the user has set any of the PTHREAD_LIBS, -# etcetera environment variables, and if threads linking works using -# them: -if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then - save_CFLAGS="$CFLAGS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) - AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes) - AC_MSG_RESULT($acx_pthread_ok) - if test x"$acx_pthread_ok" = xno; then - PTHREAD_LIBS="" - PTHREAD_CFLAGS="" - fi - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" -fi - -# We must check for the threads library under a number of different -# names; the ordering is very important because some systems -# (e.g. DEC) have both -lpthread and -lpthreads, where one of the -# libraries is broken (non-POSIX). - -# Create a list of thread flags to try. Items starting with a "-" are -# C compiler flags, and other items are library names, except for "none" -# which indicates that we try without any flags at all, and "pthread-config" -# which is a program returning the flags for the Pth emulation library. - -acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" - -# The ordering *is* (sometimes) important. Some notes on the -# individual items follow: - -# pthreads: AIX (must check this before -lpthread) -# none: in case threads are in libc; should be tried before -Kthread and -# other compiler flags to prevent continual compiler warnings -# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) -# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) -# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) -# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) -# -pthreads: Solaris/gcc -# -mthreads: Mingw32/gcc, Lynx/gcc -# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it -# doesn't hurt to check since this sometimes defines pthreads too; -# also defines -D_REENTRANT) -# ... -mt is also the pthreads flag for HP/aCC -# pthread: Linux, etcetera -# --thread-safe: KAI C++ -# pthread-config: use pthread-config program (for GNU Pth library) - -case "${host_cpu}-${host_os}" in - *solaris*) - - # On Solaris (at least, for some versions), libc contains stubbed - # (non-functional) versions of the pthreads routines, so link-based - # tests will erroneously succeed. (We need to link with -pthreads/-mt/ - # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather - # a function called by this macro, so we could check for that, but - # who knows whether they'll stub that too in a future libc.) So, - # we'll just look for -pthreads and -lpthread first: - - acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags" - ;; -esac - -if test x"$acx_pthread_ok" = xno; then -for flag in $acx_pthread_flags; do - - case $flag in - none) - AC_MSG_CHECKING([whether pthreads work without any flags]) - ;; - - -*) - AC_MSG_CHECKING([whether pthreads work with $flag]) - PTHREAD_CFLAGS="$flag" - ;; - - pthread-config) - AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no) - if test x"$acx_pthread_config" = xno; then continue; fi - PTHREAD_CFLAGS="`pthread-config --cflags`" - PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" - ;; - - *) - AC_MSG_CHECKING([for the pthreads library -l$flag]) - PTHREAD_LIBS="-l$flag" - ;; - esac - - save_LIBS="$LIBS" - save_CFLAGS="$CFLAGS" - LIBS="$PTHREAD_LIBS $LIBS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - - # Check for various functions. We must include pthread.h, - # since some functions may be macros. (On the Sequent, we - # need a special flag -Kthread to make this header compile.) - # We check for pthread_join because it is in -lpthread on IRIX - # while pthread_create is in libc. We check for pthread_attr_init - # due to DEC craziness with -lpthreads. We check for - # pthread_cleanup_push because it is one of the few pthread - # functions on Solaris that doesn't have a non-functional libc stub. - # We try pthread_create on general principles. - AC_TRY_LINK([#include ], - [pthread_t th; pthread_join(th, 0); - pthread_attr_init(0); pthread_cleanup_push(0, 0); - pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], - [acx_pthread_ok=yes]) - - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" - - AC_MSG_RESULT($acx_pthread_ok) - if test "x$acx_pthread_ok" = xyes; then - break; - fi - - PTHREAD_LIBS="" - PTHREAD_CFLAGS="" -done -fi - -# Various other checks: -if test "x$acx_pthread_ok" = xyes; then - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - save_CFLAGS="$CFLAGS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - - # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. - AC_MSG_CHECKING([for joinable pthread attribute]) - attr_name=unknown - for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do - AC_TRY_LINK([#include ], [int attr=$attr; return attr;], - [attr_name=$attr; break]) - done - AC_MSG_RESULT($attr_name) - if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then - AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name, - [Define to necessary symbol if this constant - uses a non-standard name on your system.]) - fi - - AC_MSG_CHECKING([if more special flags are required for pthreads]) - flag=no - case "${host_cpu}-${host_os}" in - *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";; - *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";; - esac - AC_MSG_RESULT(${flag}) - if test "x$flag" != xno; then - PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" - fi - - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" - - # More AIX lossage: must compile with xlc_r or cc_r - if test x"$GCC" != xyes; then - AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC}) - else - PTHREAD_CC=$CC - fi -else - PTHREAD_CC="$CC" -fi - -AC_SUBST(PTHREAD_LIBS) -AC_SUBST(PTHREAD_CFLAGS) -AC_SUBST(PTHREAD_CC) - -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -if test x"$acx_pthread_ok" = xyes; then - ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1]) - : -else - acx_pthread_ok=no - $2 -fi -AC_LANG_RESTORE -])dnl ACX_PTHREAD From 691b90f741faa344dbc63f711da7dbb712a75e3a Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 2 Nov 2022 12:00:27 +0100 Subject: [PATCH 23/68] updated action --- .github/workflows/mbt.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index ddea987..7ab2dca 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -93,7 +93,7 @@ jobs: id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) echo "::set-output name=id::$id" - name: Static Code-check - run: cppcheck --enable=all --quiet --error-exitcode=0 . + run: cppcheck --enable=all --quiet --error-exitcode=0 -I include . - name: make run: make - name: install From 7fd3df5a9a0a6d94eb05b9b674079b4f873dfd1e Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 2 Nov 2022 12:00:41 +0100 Subject: [PATCH 24/68] small code improvement --- include/mbt/Tagger.h | 2 +- src/RunTagger.cxx | 2 +- src/Tagger.cxx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index 0b9d6d6..fa3f33e 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -178,7 +178,7 @@ namespace Tagger { bool create_lexicons(); int ProcessFile( std::istream&, std::ostream& ); void ProcessTags( TagInfo * ); - void InitTest( const sentence&, std::vector&, MatchAction ); + void InitTest( const sentence&, const std::vector&, MatchAction ); bool NextBest( const sentence&, std::vector&, int, int ); const Timbl::TargetValue *Classify( MatchAction, const std::string&, const Timbl::ValueDistribution **distribution, diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 6cdf651..58b375b 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -637,7 +637,7 @@ namespace Tagger { } void TaggerClass::InitTest( const sentence& mySentence, - vector& TestPat, + const vector& TestPat, MatchAction Action ){ // Now make a testpattern for Timbl to process. string teststring = pat_to_string( mySentence, TestPat, Action, 0 ); diff --git a/src/Tagger.cxx b/src/Tagger.cxx index c660ef1..9413a26 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -75,7 +75,7 @@ namespace Tagger { class BeamData; - TaggerClass::TaggerClass( ){ + TaggerClass::TaggerClass(){ cur_log = new LogStream( cerr ); cur_log->setlevel( LogNormal ); cur_log->setstamp( StampMessage ); From 301a3fbe53b3c14b615b84f367b53622ffdaa7fa Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 12 Dec 2022 17:12:00 +0100 Subject: [PATCH 25/68] typo --- src/GenerateTagger.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index bec4fc7..9758cf6 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -627,7 +627,7 @@ namespace Tagger { } } if ( kwords < 0 || uwords < 0 ){ - cerr << "Generationg a tagger failed" << endl; + cerr << "Generating a tagger failed" << endl; return -1; } COUT << " ready: " << kwords << " words processed." From 23bc3dd84db0007ace2ee520a628952af4b76206 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 12 Dec 2022 17:13:41 +0100 Subject: [PATCH 26/68] updated GitHub action --- .github/workflows/mbt.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 7ab2dca..db7b821 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -33,6 +33,11 @@ jobs: compiler: [g++, clang++] steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.9.0 + with: + access_token: ${{ github.token }} + - uses: actions/checkout@v2 - name: Install Build Environment run: | @@ -91,7 +96,7 @@ jobs: id: compiler run: | id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) - echo "::set-output name=id::$id" + echo "id=$id" >> GITHUB_OUTPUT - name: Static Code-check run: cppcheck --enable=all --quiet --error-exitcode=0 -I include . - name: make From 613f124ca2880a2ecf0cf4f23e90cc393cedb7e0 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 19 Dec 2022 09:24:07 +0100 Subject: [PATCH 27/68] colourfull .yml --- .github/workflows/mbt.yml | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index db7b821..4f79dc8 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -7,6 +7,7 @@ on: - 'src/**' - 'include/**' - '.github/**' + - '.github/workflows/*' pull_request: branches: [master] @@ -15,18 +16,26 @@ jobs: notification: runs-on: ubuntu-latest name: Notify start to gitlama + outputs: + branch: ${{ steps.extract_branch.outputs.branch }} steps: + - name: Extract Branch name + id: extract_branch + shell: bash + run: echo "branch=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT - name: IRC notification uses: Gottox/irc-message-action@v1 with: server: irc.uvt.nl channel: '#gitlama' nickname: GitHub - message: |- - ${{ github.actor }} started a build of ${{ github.event.repository.name }} + message: > + ${{ github.actor }} started a build of Mbt + [${{ steps.extract_branch.outputs.branch }}] build: runs-on: ${{ matrix.os }} + needs: notification strategy: matrix: os: [ubuntu-latest, macos-latest] @@ -34,7 +43,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.9.0 + uses: styfle/cancel-workflow-action@0.11.0 with: access_token: ${{ github.token }} @@ -96,7 +105,7 @@ jobs: id: compiler run: | id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) - echo "id=$id" >> GITHUB_OUTPUT + echo "id=$id" >> $GITHUB_OUTPUT - name: Static Code-check run: cppcheck --enable=all --quiet --error-exitcode=0 -I include . - name: make @@ -112,19 +121,17 @@ jobs: run: cat src/test-suite.log - name: Notify IRC of failure if: ${{ failure() }} - uses: Gottox/irc-message-action@v1 + uses: Gottox/irc-message-action@v2 with: server: irc.uvt.nl channel: '#gitlama' nickname: GH-${{ runner.os }}-${{ steps.compiler.outputs.id }} - message: |- - ${{ github.event.repository.name }} with ${{ matrix.compiler }} build by ${{ github.actor }} on ${{ matrix.os }}: FAILED + message: "mbt [${{ needs.notification.outputs.branch }}] build with ${{ matrix.compiler }} by ${{ github.actor }} on ${{ matrix.os }}: \u00034FAIL\u0003" - name: Notify IRC of succes if: ${{ success() }} - uses: Gottox/irc-message-action@v1 + uses: Gottox/irc-message-action@v2 with: server: irc.uvt.nl channel: '#gitlama' nickname: GH-${{ runner.os }}-${{ steps.compiler.outputs.id }} - message: |- - ${{ github.event.repository.name }} with ${{ matrix.compiler }} build by ${{ github.actor }} on ${{ matrix.os }}: SUCCESS + message: "mbt [${{ needs.notification.outputs.branch }}] build with ${{ matrix.compiler }} by ${{ github.actor }} on ${{ matrix.os }}: \u00033SUCCESS\u0003" From d6f0c3d63af2ad1267e96d6af8da1e0e8e831829 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 19 Dec 2022 14:56:12 +0100 Subject: [PATCH 28/68] require newest timbl --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index ba5d1aa..d56a257 100644 --- a/configure.ac +++ b/configure.ac @@ -63,7 +63,7 @@ CXXFLAGS="$CXXFLAGS $ICU_CFLAGS" LIBS="$ICU_LIBS $LIBS" -PKG_CHECK_MODULES([timbl], [timbl >= 6.4.14] ) +PKG_CHECK_MODULES([timbl], [timbl >= 6.8] ) CXXFLAGS="$CXXFLAGS $timbl_CFLAGS" LIBS="$LIBS $timbl_LIBS" From 51bb819892bdb611983a1e029c4ba7e4e33e2dc0 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 19 Dec 2022 14:56:53 +0100 Subject: [PATCH 29/68] update --- .github/workflows/mbt.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 4f79dc8..66adf44 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -24,7 +24,7 @@ jobs: shell: bash run: echo "branch=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT - name: IRC notification - uses: Gottox/irc-message-action@v1 + uses: Gottox/irc-message-action@v2 with: server: irc.uvt.nl channel: '#gitlama' From 930f42fb7ce989d376ed4cff80d087e49bb22708 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 19 Dec 2022 14:57:25 +0100 Subject: [PATCH 30/68] some Unicode improvements --- include/mbt/Tagger.h | 11 ++++++----- src/RunTagger.cxx | 38 +++++++++++++++++++++----------------- src/Tagger.cxx | 8 ++++---- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index fa3f33e..2351edd 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -180,7 +180,8 @@ namespace Tagger { void ProcessTags( TagInfo * ); void InitTest( const sentence&, const std::vector&, MatchAction ); bool NextBest( const sentence&, std::vector&, int, int ); - const Timbl::TargetValue *Classify( MatchAction, const std::string&, + const Timbl::TargetValue *Classify( MatchAction, + const icu::UnicodeString&, const Timbl::ValueDistribution **distribution, double& ); void statistics( const sentence&, @@ -188,10 +189,10 @@ namespace Tagger { int& no_unknown, int& no_correct_known, int& no_correct_unknown ); - std::string pat_to_string( const sentence&, - const std::vector&, - MatchAction, - int ); + icu::UnicodeString pat_to_string( const sentence&, + const std::vector&, + MatchAction, + int ); std::string TimblOptStr; int FilterThreshold; diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 58b375b..62bd858 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -379,10 +379,10 @@ namespace Tagger { os << endl; } - string TaggerClass::pat_to_string( const sentence& mySentence, - const vector& pat, - MatchAction action, - int word ){ + UnicodeString TaggerClass::pat_to_string( const sentence& mySentence, + const vector& pat, + MatchAction action, + int word ){ int slots; if ( action == Unknown ){ slots = Utemplate.totalslots() - Utemplate.skipfocus; @@ -396,9 +396,7 @@ namespace Tagger { line += " "; } const vector enr = mySentence.getEnrichments(word); - for ( const auto& er: enr ){ - line += er + " "; - } + line += TiCC::join( enr, " " ); if ( input_kind != UNTAGGED ){ line += mySentence.gettag(word); } @@ -416,7 +414,7 @@ namespace Tagger { } cout << endl; } - return TiCC::UnicodeToUTF8(line); + return line; } void TaggerClass::read_lexicon( const string& FileName ){ @@ -604,7 +602,7 @@ namespace Tagger { #endif const TargetValue *TaggerClass::Classify( MatchAction Action, - const string& teststring, + const icu::UnicodeString& teststring, const ValueDistribution **distribution, double& distance ){ const TargetValue *answer = 0; @@ -616,12 +614,12 @@ namespace Tagger { timer1.start(); if ( Action == Known ){ timer2.start(); - answer = KnownTree->Classify( teststring, *distribution, distance ); + answer = KnownTree->Classify_u( teststring, *distribution, distance ); timer2.stop(); } else { timer3.start(); - answer = unKnownTree->Classify( teststring, *distribution, distance ); + answer = unKnownTree->Classify_u( teststring, *distribution, distance ); timer3.stop(); } timer1.stop(); @@ -640,10 +638,13 @@ namespace Tagger { const vector& TestPat, MatchAction Action ){ // Now make a testpattern for Timbl to process. - string teststring = pat_to_string( mySentence, TestPat, Action, 0 ); + UnicodeString test_string = pat_to_string( mySentence, TestPat, Action, 0 ); const ValueDistribution *distribution = 0; double distance; - const TargetValue *answer = Classify( Action, teststring, &distribution, distance ); + const TargetValue *answer = Classify( Action, + test_string, + &distribution, + distance ); distance_array.resize( mySentence.size() ); distribution_array.resize( mySentence.size() ); confidence_array.resize( mySentence.size() ); @@ -680,14 +681,17 @@ namespace Tagger { *kwordlist, TheLex, i_word, Beam->paths[beam_cnt] ) ){ // Now make a testpattern for Timbl to process. - string teststring = pat_to_string( mySentence, TestPat, Action, i_word ); - // process teststring to predict a category, using the + UnicodeString test_string = pat_to_string( mySentence, + TestPat, + Action, + i_word ); + // process test_string to predict a category, using the // appropriate tree // - // cerr << "teststring '" << teststring << "'" << endl; + // cerr << "test_string '" << test_string << "'" << endl; const ValueDistribution *distribution = 0; double distance; - const TargetValue *answer = Classify( Action, teststring, + const TargetValue *answer = Classify( Action, test_string, &distribution, distance ); if ( beam_cnt == 0 ){ if ( distance_flag ){ diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 9413a26..0eed71c 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -121,13 +121,13 @@ namespace Tagger { } TaggerClass::TaggerClass( const TaggerClass& in ): - cur_log( in.cur_log ), + cur_log( in.cur_log ), //!> is a pointer to avoid copies KnownTree( in.KnownTree ), unKnownTree( in.unKnownTree ), initialized( in.initialized ), - kwordlist( in.kwordlist ), - uwordlist( in.uwordlist ), - Beam( 0 ), + kwordlist( in.kwordlist ), //!> is a pointer to avoid copies + uwordlist( in.uwordlist ), //!> is a pointer to avoid copies + Beam( 0 ), //!> reset pointer input_kind( in.input_kind ), piped_input( in.piped_input ), lexflag( in.lexflag ), From a9b12b417d91c2d11260c4b89eabbbd3406602df Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 19 Dec 2022 17:28:53 +0100 Subject: [PATCH 31/68] fix --- src/RunTagger.cxx | 1 + src/TagLex.cxx | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 62bd858..8f5e879 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -397,6 +397,7 @@ namespace Tagger { } const vector enr = mySentence.getEnrichments(word); line += TiCC::join( enr, " " ); + line += " "; if ( input_kind != UNTAGGED ){ line += mySentence.gettag(word); } diff --git a/src/TagLex.cxx b/src/TagLex.cxx index ced23d2..c9b33d6 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -75,7 +75,7 @@ namespace Tagger { for( const auto& it : TagFreqs ){ result += it.first; result += ":"; - result += TiCC::UnicodeFromUTF8(TiCC::toString(it.second)); + result += TiCC::toUnicodeString(it.second); result += " "; } return result; From 3bebc5512edf4fda9e6c38ef106567546c434c30 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 20 Dec 2022 12:32:19 +0100 Subject: [PATCH 32/68] we can doe without the _u... --- src/RunTagger.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 8f5e879..e23e3f3 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -615,12 +615,12 @@ namespace Tagger { timer1.start(); if ( Action == Known ){ timer2.start(); - answer = KnownTree->Classify_u( teststring, *distribution, distance ); + answer = KnownTree->Classify( teststring, *distribution, distance ); timer2.stop(); } else { timer3.start(); - answer = unKnownTree->Classify_u( teststring, *distribution, distance ); + answer = unKnownTree->Classify( teststring, *distribution, distance ); timer3.stop(); } timer1.stop(); From f22497227e86d2aa723c55cac04719bb717f6ef9 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 24 Dec 2022 13:41:49 +0100 Subject: [PATCH 33/68] leaping into 2023 --- include/mbt/Logging.h | 2 +- include/mbt/MbtAPI.h | 2 +- include/mbt/Pattern.h | 2 +- include/mbt/Sentence.h | 2 +- include/mbt/TagLex.h | 2 +- include/mbt/Tagger.h | 2 +- src/GenerateTagger.cxx | 2 +- src/Mbt.cxx | 2 +- src/MbtAPI.cxx | 2 +- src/Mbtg.cxx | 2 +- src/Pattern.cxx | 2 +- src/RunTagger.cxx | 4 ++-- src/Sentence.cxx | 2 +- src/TagLex.cxx | 2 +- src/Tagger.cxx | 2 +- src/convert.cxx | 2 +- src/simpletest.cxx | 2 +- 17 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/mbt/Logging.h b/include/mbt/Logging.h index 5b70152..94e5a3c 100644 --- a/include/mbt/Logging.h +++ b/include/mbt/Logging.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/MbtAPI.h b/include/mbt/MbtAPI.h index 7a6e2cf..3976777 100644 --- a/include/mbt/MbtAPI.h +++ b/include/mbt/MbtAPI.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Pattern.h b/include/mbt/Pattern.h index 0f49b06..0e8427c 100644 --- a/include/mbt/Pattern.h +++ b/include/mbt/Pattern.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Sentence.h b/include/mbt/Sentence.h index 2073933..6e03529 100644 --- a/include/mbt/Sentence.h +++ b/include/mbt/Sentence.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/TagLex.h b/include/mbt/TagLex.h index 7dacb60..d9235c1 100644 --- a/include/mbt/TagLex.h +++ b/include/mbt/TagLex.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index 2351edd..a081bf7 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index 9758cf6..93fa253 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Mbt.cxx b/src/Mbt.cxx index e4550c8..73fc0f9 100644 --- a/src/Mbt.cxx +++ b/src/Mbt.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/MbtAPI.cxx b/src/MbtAPI.cxx index 990b510..de489ff 100644 --- a/src/MbtAPI.cxx +++ b/src/MbtAPI.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Mbtg.cxx b/src/Mbtg.cxx index 028002e..0233dcd 100644 --- a/src/Mbtg.cxx +++ b/src/Mbtg.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Pattern.cxx b/src/Pattern.cxx index cc92e49..8cc9062 100644 --- a/src/Pattern.cxx +++ b/src/Pattern.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index e23e3f3..1bc486f 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp @@ -1311,7 +1311,7 @@ namespace Tagger { void TaggerClass::manifest( const string& prog ){ // present yourself to the user // - cerr << prog << " " << VERSION << " (c) CLST, ILK and CLiPS 1998 - 2022." + cerr << prog << " " << VERSION << " (c) CLST, ILK and CLiPS 1998 - 2023." << endl << "Memory Based Tagger " << endl << "CLST - Centre for Language and Speech Technology," diff --git a/src/Sentence.cxx b/src/Sentence.cxx index 652be7f..db8088f 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/TagLex.cxx b/src/TagLex.cxx index c9b33d6..a471d51 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 0eed71c..f15eed9 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/convert.cxx b/src/convert.cxx index 5958a95..09f2364 100644 --- a/src/convert.cxx +++ b/src/convert.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/simpletest.cxx b/src/simpletest.cxx index afd8fea..e55d5f3 100644 --- a/src/simpletest.cxx +++ b/src/simpletest.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2022 + Copyright (c) 1998 - 2023 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp From 1010034446946902ae0f3094c6bbe19d8d82b1a1 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 2 Jan 2023 15:28:44 +0100 Subject: [PATCH 34/68] use newest functions from Timbl --- src/RunTagger.cxx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 1bc486f..1824a5b 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -237,11 +237,11 @@ namespace Tagger { } double sum_freq = 0.0; for ( const auto& it : *Dist ){ - UnicodeString name = it.second->Value()->name_u(); + UnicodeString name = it.second->Value()->name(); double freq = it.second->Weight(); sum_freq += freq; tmp = new name_prob_pair( name, freq ); - if ( name == PrefClass->name_u() ){ + if ( name == PrefClass->name() ){ assert( Pref == 0 ); Pref = tmp; } @@ -267,7 +267,7 @@ namespace Tagger { const TargetValue *answer, const ValueDistribution *distrib ){ if ( size == 1 ){ - paths[0][0] = TheLex.hash( answer->name_u() ); + paths[0][0] = TheLex.hash( answer->name() ); path_prob[0] = 1.0; } else { @@ -299,7 +299,7 @@ namespace Tagger { if ( size == 1 ){ n_best_array[0]->prob = 1.0; n_best_array[0]->path = beam_cnt; - n_best_array[0]->tag = TheLex.hash( answer->name_u() ); + n_best_array[0]->tag = TheLex.hash( answer->name() ); } else { DBG << "BeamData::NextPath[" << beam_cnt << "] ( " << answer << " , " From cd95a353fd5b7d6d327d67f14c90d853877b2d50 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 2 Jan 2023 15:45:44 +0100 Subject: [PATCH 35/68] NEWS about upcoming release --- NEWS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS b/NEWS index 32cc2a1..a6c3edb 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +mbt version 3.9 2023-01-02 +[Ko van der Sloot] +* start using the newest Timbl +* Better Unicode support +* modernizing code + mbt version 3.8 2022-07-22 [Maarten van Gompel] * updated metadata (codemeta.json) following new (proposed) CLARIAH requirements (CLARIAH/clariah-plus#38) From 5eb180fa6775eb94979bf6eedf52bd1ed1440003 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 2 Jan 2023 15:48:25 +0100 Subject: [PATCH 36/68] versions weren't updates after previous release --- codemeta.json | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/codemeta.json b/codemeta.json index 9b31ef8..c59cfa6 100644 --- a/codemeta.json +++ b/codemeta.json @@ -7,7 +7,7 @@ "@type": "SoftwareSourceCode", "identifier": "mbt", "name": "mbt", - "version": "3.8", + "version": "3.9", "description": "MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech. ", "license": "https://spdx.org/licenses/GPL-3.0", "url": "https://languagemachines.github.io/mbt", diff --git a/configure.ac b/configure.ac index d56a257..e6a19b7 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.69]) -AC_INIT([mbt],[3.8],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! +AC_INIT([mbt],[3.9],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([.]) From af100a833b051c5d7b4a80240cf3714bcdba42a3 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 2 Jan 2023 15:50:04 +0100 Subject: [PATCH 37/68] bumoed version after release --- codemeta.json | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/codemeta.json b/codemeta.json index c59cfa6..1a36903 100644 --- a/codemeta.json +++ b/codemeta.json @@ -7,7 +7,7 @@ "@type": "SoftwareSourceCode", "identifier": "mbt", "name": "mbt", - "version": "3.9", + "version": "3.10", "description": "MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech. ", "license": "https://spdx.org/licenses/GPL-3.0", "url": "https://languagemachines.github.io/mbt", diff --git a/configure.ac b/configure.ac index e6a19b7..a1dda52 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.69]) -AC_INIT([mbt],[3.9],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! +AC_INIT([mbt],[3.10],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([.]) From b77858d70e35410d57451dfda6e4fc2771ef2b33 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 4 Jan 2023 15:59:56 +0100 Subject: [PATCH 38/68] Timbl::ValueDistribution is renamed to ClassDistribution --- src/RunTagger.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 1824a5b..fef7b5c 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -225,7 +225,7 @@ namespace Tagger { } - name_prob_pair *break_down( const ValueDistribution *Dist, + name_prob_pair *break_down( const ClassDistribution *Dist, const TargetValue *PrefClass ){ // split a distribution into names/probabilities AND sort them descending // But put preferred in front. @@ -265,7 +265,7 @@ namespace Tagger { void BeamData::InitPaths( UnicodeHash& TheLex, const TargetValue *answer, - const ValueDistribution *distrib ){ + const ClassDistribution *distrib ){ if ( size == 1 ){ paths[0][0] = TheLex.hash( answer->name() ); path_prob[0] = 1.0; @@ -294,7 +294,7 @@ namespace Tagger { void BeamData::NextPath( UnicodeHash& TheLex, const TargetValue *answer, - const ValueDistribution *distrib, + const ClassDistribution *distrib, int beam_cnt ){ if ( size == 1 ){ n_best_array[0]->prob = 1.0; @@ -604,7 +604,7 @@ namespace Tagger { const TargetValue *TaggerClass::Classify( MatchAction Action, const icu::UnicodeString& teststring, - const ValueDistribution **distribution, + const ClassDistribution **distribution, double& distance ){ const TargetValue *answer = 0; #if defined(HAVE_PTHREAD) @@ -640,7 +640,7 @@ namespace Tagger { MatchAction Action ){ // Now make a testpattern for Timbl to process. UnicodeString test_string = pat_to_string( mySentence, TestPat, Action, 0 ); - const ValueDistribution *distribution = 0; + const ClassDistribution *distribution = 0; double distance; const TargetValue *answer = Classify( Action, test_string, @@ -690,7 +690,7 @@ namespace Tagger { // appropriate tree // // cerr << "test_string '" << test_string << "'" << endl; - const ValueDistribution *distribution = 0; + const ClassDistribution *distribution = 0; double distance; const TargetValue *answer = Classify( Action, test_string, &distribution, distance ); From d1a7df6973f15135d482869a20ba404d7d0eb56c Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 4 Jan 2023 16:16:14 +0100 Subject: [PATCH 39/68] replaced ugly ** by les ugly *& --- include/mbt/Tagger.h | 2 +- src/RunTagger.cxx | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index a081bf7..553299f 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -182,7 +182,7 @@ namespace Tagger { bool NextBest( const sentence&, std::vector&, int, int ); const Timbl::TargetValue *Classify( MatchAction, const icu::UnicodeString&, - const Timbl::ValueDistribution **distribution, + const Timbl::ValueDistribution *&, double& ); void statistics( const sentence&, int& no_known, diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index fef7b5c..9da429b 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -604,7 +604,7 @@ namespace Tagger { const TargetValue *TaggerClass::Classify( MatchAction Action, const icu::UnicodeString& teststring, - const ClassDistribution **distribution, + const ClassDistribution *&distribution, double& distance ){ const TargetValue *answer = 0; #if defined(HAVE_PTHREAD) @@ -615,12 +615,12 @@ namespace Tagger { timer1.start(); if ( Action == Known ){ timer2.start(); - answer = KnownTree->Classify( teststring, *distribution, distance ); + answer = KnownTree->Classify( teststring, distribution, distance ); timer2.stop(); } else { timer3.start(); - answer = unKnownTree->Classify( teststring, *distribution, distance ); + answer = unKnownTree->Classify( teststring, distribution, distance ); timer3.stop(); } timer1.stop(); @@ -644,7 +644,7 @@ namespace Tagger { double distance; const TargetValue *answer = Classify( Action, test_string, - &distribution, + distribution, distance ); distance_array.resize( mySentence.size() ); distribution_array.resize( mySentence.size() ); @@ -693,7 +693,7 @@ namespace Tagger { const ClassDistribution *distribution = 0; double distance; const TargetValue *answer = Classify( Action, test_string, - &distribution, distance ); + distribution, distance ); if ( beam_cnt == 0 ){ if ( distance_flag ){ distance_array[i_word] = distance; From b824e27e69190153ac68eaf4a70a2f069c51cd76 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 4 Jan 2023 17:53:10 +0100 Subject: [PATCH 40/68] ValueDistribution ==> ClassDistribution --- include/mbt/Tagger.h | 8 ++++---- src/RunTagger.cxx | 8 +++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index 553299f..8c42f14 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -68,10 +68,10 @@ namespace Tagger { bool Init( int, unsigned int ); void InitPaths( Hash::UnicodeHash&, const Timbl::TargetValue *, - const Timbl::ValueDistribution * ); + const Timbl::ClassDistribution * ); void NextPath( Hash::UnicodeHash&, const Timbl::TargetValue *, - const Timbl::ValueDistribution *, + const Timbl::ClassDistribution *, int ); void ClearBest(); void Shift( int, int ); @@ -80,7 +80,7 @@ namespace Tagger { int size; int **paths; int **temppaths; - double *path_prob; + std::vector path_prob; n_best_tuple **n_best_array; private: BeamData( const BeamData& ); // inhibit copies @@ -182,7 +182,7 @@ namespace Tagger { bool NextBest( const sentence&, std::vector&, int, int ); const Timbl::TargetValue *Classify( MatchAction, const icu::UnicodeString&, - const Timbl::ValueDistribution *&, + const Timbl::ClassDistribution *&, double& ); void statistics( const sentence&, int& no_known, diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 9da429b..d0ea257 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -70,7 +70,6 @@ namespace Tagger { size = 0; paths = 0; temppaths = 0; - path_prob = 0; n_best_array = 0; } @@ -84,16 +83,15 @@ namespace Tagger { } delete [] paths; delete [] temppaths; - delete [] path_prob; delete [] n_best_array; } bool BeamData::Init( int Size, unsigned int noWords ){ // Beaming Stuff... - if ( path_prob == 0 ){ + if ( path_prob.size() == 0 ){ // the first time - if ( (path_prob = new double[Size]) == 0 || - (n_best_array = new n_best_tuple*[Size]) == 0 || + path_prob.resize(Size); + if ( (n_best_array = new n_best_tuple*[Size]) == 0 || (paths = new int*[Size]) == 0 || (temppaths = new int*[Size]) == 0 ){ throw runtime_error( "Beam: not enough memory for N-best search tables" ); From 73da15d93fe3db2deed94235d98e7e3afd878c3b Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 19 Jan 2023 12:26:15 +0100 Subject: [PATCH 41/68] removed dependency on libtar --- .github/workflows/mbt.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 66adf44..1b69072 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -63,12 +63,11 @@ jobs: run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt-get install libicu-dev libxml2-dev libbz2-dev - sudo apt-get install zlib1g-dev libtar-dev cppcheck + sudo apt-get install zlib1g-dev cppcheck else brew install libxml2 brew install bzip2 brew install zlib - brew install libtar brew install cppcheck fi - name: install TiccUtils From c3f81784b54fdc1279f995e86c16dbed6676ebf4 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 3 Feb 2023 08:54:21 +0100 Subject: [PATCH 42/68] removed use of deprecated sprintf function. Needs most recent ticcutils now --- configure.ac | 2 +- src/Tagger.cxx | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index a1dda52..47e7d2b 100644 --- a/configure.ac +++ b/configure.ac @@ -67,7 +67,7 @@ PKG_CHECK_MODULES([timbl], [timbl >= 6.8] ) CXXFLAGS="$CXXFLAGS $timbl_CFLAGS" LIBS="$LIBS $timbl_LIBS" -PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.28] ) +PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.32] ) CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS" LIBS="$LIBS $ticcutils_LIBS" diff --git a/src/Tagger.cxx b/src/Tagger.cxx index f15eed9..9efe0d0 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -340,16 +340,13 @@ namespace Tagger { return false; } } - char affix[32]; + LexFileBaseName = TestFileName; LexFileBaseName += ".lex"; LexFileName = prefixWithAbsolutePath( LexFileBaseName, SettingsFilePath ); - if ( FilterThreshold < 10 ){ - sprintf( affix, ".0%1i", FilterThreshold ); - } - else { - sprintf( affix, ".%2i", FilterThreshold ); - } + string affix = std::to_string( FilterThreshold ); + affix = TiCC::pad( affix, 2, '0' ); + affix = "." + affix; if ( !knownoutfileflag ){ K_option_name = TestFileName + ".known.inst." + KtmplStr; K_option_name = prefixWithAbsolutePath( K_option_name, @@ -384,12 +381,14 @@ namespace Tagger { TopNFileName = TopNFileBaseName; } else { - sprintf( affix, ".top%d", TopNumber ); + affix = std::to_string( TopNumber ); + affix = ".top" + affix; TopNFileBaseName = TestFileName + affix; TopNFileName = prefixWithAbsolutePath( TopNFileBaseName, SettingsFilePath ); } - sprintf( affix, ".%dpaxes", Npax ); + affix = std::to_string( Npax ); + affix = "." + affix + "paxes"; NpaxFileBaseName = TestFileName + affix; NpaxFileName = prefixWithAbsolutePath( NpaxFileBaseName, SettingsFilePath ); From 28fa0d53aa1704c7528da428cdbbcd917a12925c Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 9 Feb 2023 13:18:08 +0100 Subject: [PATCH 43/68] numb change --- include/mbt/Tagger.h | 4 ---- src/MbtAPI.cxx | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index 8c42f14..f2f2120 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -277,10 +277,6 @@ namespace Tagger { bool _known; }; - inline void RemoveTagger( TaggerClass* tagger ){ - delete tagger; - } - std::vector StringToTR( const std::string&, bool=false ); const icu::UnicodeString& indexlex( const unsigned int, Hash::UnicodeHash& ); diff --git a/src/MbtAPI.cxx b/src/MbtAPI.cxx index de489ff..65cd80d 100644 --- a/src/MbtAPI.cxx +++ b/src/MbtAPI.cxx @@ -92,7 +92,7 @@ MbtAPI::MbtAPI( const string& optstring, TiCC::LogStream& ls ){ } MbtAPI::~MbtAPI(){ - RemoveTagger( tagger ); + delete tagger; } bool MbtAPI::isInit() const{ From 915cf4d70187f9c69f811bdfaeae6d0a1c09a839 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 18 Feb 2023 11:26:00 +0100 Subject: [PATCH 44/68] CppCheck happy --- src/Tagger.cxx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 9efe0d0..2d836b0 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -75,8 +75,9 @@ namespace Tagger { class BeamData; - TaggerClass::TaggerClass(){ - cur_log = new LogStream( cerr ); + TaggerClass::TaggerClass(): + cur_log(new LogStream( cerr )) + { cur_log->setlevel( LogNormal ); cur_log->setstamp( StampMessage ); default_cout.setstamp( NoStamp ); From fd7cb7ebdd52bef2794f16f329569bedad3143e9 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 29 Apr 2023 10:34:36 +0200 Subject: [PATCH 45/68] modernizing --- .github/workflows/mbt.yml | 2 +- src/Makefile.am | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 1b69072..5d989f2 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -47,7 +47,7 @@ jobs: with: access_token: ${{ github.token }} - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install Build Environment run: | if [ "$RUNNER_OS" == "Linux" ]; then diff --git a/src/Makefile.am b/src/Makefile.am index 71d0a41..baeecb6 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,5 @@ AM_CPPFLAGS = -I@top_srcdir@/include -AM_CXXFLAGS = -std=c++11 -W -Wall -g -pedantic -O3 +AM_CXXFLAGS = -std=c++14 -W -Wall -g -pedantic -O3 LDADD = libmbt.la From fa2011c53f80577ca8d042a42ccf344628f385b5 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sun, 24 Sep 2023 10:48:42 +0200 Subject: [PATCH 46/68] static_cast is 'better" --- src/TagLex.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TagLex.cxx b/src/TagLex.cxx index a471d51..24ac76f 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -171,7 +171,7 @@ namespace Tagger { vector TagLex::CreateSortedVector(){ vector TagVec; - TagTree->ForEachDo( StoreInVector, (void *)&TagVec ); + TagTree->ForEachDo( StoreInVector, static_cast(&TagVec) ); sort( TagVec.begin(), TagVec.end() , ascendingInfo ); return TagVec; } From a46e90eb13f4f5e2f444f52472503b4f7eef3b43 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 18 Oct 2023 16:37:31 +0200 Subject: [PATCH 47/68] NFC normalization is now performed by TiCC::getline() --- configure.ac | 2 +- src/GenerateTagger.cxx | 2 -- src/Sentence.cxx | 12 +++--------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/configure.ac b/configure.ac index 47e7d2b..87f35e1 100644 --- a/configure.ac +++ b/configure.ac @@ -67,7 +67,7 @@ PKG_CHECK_MODULES([timbl], [timbl >= 6.8] ) CXXFLAGS="$CXXFLAGS $timbl_CFLAGS" LIBS="$LIBS $timbl_LIBS" -PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.32] ) +PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.34] ) CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS" LIBS="$LIBS $ticcutils_LIBS" diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index 93fa253..7f4a795 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -105,11 +105,9 @@ namespace Tagger { cerr << "couldn't open inputfile " << filename << endl; return false; } - TiCC::UnicodeNormalizer nfc_normalizer; map TagList; UnicodeString buffer; while ( TiCC::getline( lex_file, buffer ) ){ - buffer = nfc_normalizer.normalize( buffer ); UnicodeString word, tag; if ( split_special( buffer, word, tag ) ){ TaggedLexicon.Store( word, tag ); diff --git a/src/Sentence.cxx b/src/Sentence.cxx index db8088f..dad5a6f 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -478,7 +478,6 @@ namespace Tagger { size_t& line_no ){ // read a whole sentence from a stream // A sentence can be delimited either by an Eos marker or EOF. - static TiCC::UnicodeNormalizer nfc_normalizer; clear(); UnicodeString line; while ( TiCC::getline( infile, line ) ){ @@ -494,7 +493,6 @@ namespace Tagger { else if ( Utt_Terminator( line ) ){ return true; } - line = nfc_normalizer.normalize( line ); vector parts = TiCC::split_at_first_of( line, seps ); if ( parts.size() != 2 ){ #pragma omp critical (errors) @@ -522,7 +520,6 @@ namespace Tagger { size_t& line_no ){ // read a whole sentence from a stream // A sentence can be delimited either by an Eos marker or EOF. - static TiCC::UnicodeNormalizer nfc_normalizer; clear(); // cerr << "untagged-read remainder='" << remainder << "'" << endl; UnicodeString line = remainder; @@ -530,15 +527,14 @@ namespace Tagger { while ( !line.isEmpty() || TiCC::getline( infile, line ) ){ ++line_no; // cerr << "untagged-read line: " << line << endl; - UnicodeString u_line = nfc_normalizer.normalize( line ); - u_line.trim(); - if ( u_line.isEmpty() ){ + line.trim(); + if ( line.isEmpty() ){ if ( InternalEosMark == "EL" ){ return true; } continue; } - vector parts = TiCC::split_at_first_of( u_line, seps ); + vector parts = TiCC::split_at_first_of( line, seps ); line = ""; bool terminated = false; for ( const auto& p : parts ){ @@ -566,7 +562,6 @@ namespace Tagger { // read a sequence of enriched and tagged words from infile // every word must be a one_liner // cleanup the sentence for re-use... - static TiCC::UnicodeNormalizer nfc_normalizer; clear(); UnicodeString line; while( TiCC::getline( infile, line ) ){ @@ -581,7 +576,6 @@ namespace Tagger { else if ( Utt_Terminator( line ) ){ return true; } - line = nfc_normalizer.normalize( line ); vector extras = TiCC::split_at_first_of( line, seps ); if ( extras.size() >= 2 ){ UnicodeString Word = extras.front(); From 28b38576f2b02e0eea2c400069bb12f2d2775be4 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 21 Oct 2023 00:13:49 +0200 Subject: [PATCH 48/68] const correctness --- include/mbt/Sentence.h | 3 ++- include/mbt/TagLex.h | 4 ++-- src/Sentence.cxx | 2 +- src/TagLex.cxx | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/mbt/Sentence.h b/include/mbt/Sentence.h index 6e03529..015b971 100644 --- a/include/mbt/Sentence.h +++ b/include/mbt/Sentence.h @@ -69,7 +69,8 @@ namespace Tagger { sentence( const PatTemplate&, const PatTemplate& ); ~sentence(); void clear(); - bool init_windowing( std::map&, UnicodeHash& ); + bool init_windowing( const std::map&, + UnicodeHash& ); bool nextpat( MatchAction&, std::vector&, UnicodeHash& , UnicodeHash&, unsigned int, int * = 0 ) const; int classify_hapax( const icu::UnicodeString&, UnicodeHash& ) const; diff --git a/include/mbt/TagLex.h b/include/mbt/TagLex.h index d9235c1..e07c340 100644 --- a/include/mbt/TagLex.h +++ b/include/mbt/TagLex.h @@ -36,7 +36,7 @@ namespace Tagger { // a Tagged Lexion. Stores strings , frequencies and assigned tags class TagInfo { - friend std::ostream& operator<<( std::ostream&, TagInfo * ); + friend std::ostream& operator<<( std::ostream&, const TagInfo * ); public: TagInfo( const icu::UnicodeString& , const icu::UnicodeString& ); @@ -55,7 +55,7 @@ namespace Tagger { }; class TagLex { - friend std::ostream& operator<< ( std::ostream&, TagLex * ); + friend std::ostream& operator<< ( std::ostream&, const TagLex * ); public: TagLex(); ~TagLex(); diff --git a/src/Sentence.cxx b/src/Sentence.cxx index dad5a6f..48bc5e3 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -164,7 +164,7 @@ namespace Tagger { add(a_word, tmp, a_tag); } - bool sentence::init_windowing( map& lex, + bool sentence::init_windowing( const map& lex, UnicodeHash& TheLex ) { if ( UTAG == -1 ){ #pragma omp critical (hasher) diff --git a/src/TagLex.cxx b/src/TagLex.cxx index 24ac76f..c136378 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -107,7 +107,7 @@ namespace Tagger { StringRepr = tmpstr; } - ostream& operator<<( ostream& os, TagInfo *LI ){ + ostream& operator<<( ostream& os, const TagInfo *LI ){ if ( LI ){ os << " " << LI->Word << ":" << LI->WordFreq << " {" << LI->DisplayTagFreqs() << "} " << LI->StringRepr; @@ -176,7 +176,7 @@ namespace Tagger { return TagVec; } - ostream& operator<<( ostream& os, TagLex *L ){ + ostream& operator<<( ostream& os, const TagLex *L ){ return os << L->TagTree; } } From 8160f7b3c1d7627edcc90fb38042f4e02b78582a Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 21 Oct 2023 18:35:11 +0200 Subject: [PATCH 49/68] news about release --- NEWS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS b/NEWS index a6c3edb..9441a7f 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +mbt version 3.10 2023-10-21 +[Ko van der Sloot] +* removed dependency on libtar +* Timbl::ClassDistribution replaces Timbl::ValueDistribution +* modernizing code +* use ticcutils > 0.34 to have NF normalization of Unicode + mbt version 3.9 2023-01-02 [Ko van der Sloot] * start using the newest Timbl From 155e6e79015b692bbb84e1a4b49c5b501c620473 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 21 Oct 2023 18:37:09 +0200 Subject: [PATCH 50/68] bump version after release --- codemeta.json | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/codemeta.json b/codemeta.json index 1a36903..cd0a29d 100644 --- a/codemeta.json +++ b/codemeta.json @@ -7,7 +7,7 @@ "@type": "SoftwareSourceCode", "identifier": "mbt", "name": "mbt", - "version": "3.10", + "version": "3.11", "description": "MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech. ", "license": "https://spdx.org/licenses/GPL-3.0", "url": "https://languagemachines.github.io/mbt", diff --git a/configure.ac b/configure.ac index 87f35e1..3c413ce 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.69]) -AC_INIT([mbt],[3.10],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! +AC_INIT([mbt],[3.11],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([.]) From fbd329dffaf67e9231d24824bcd8504c641f726a Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Wed, 1 Nov 2023 10:17:11 +0100 Subject: [PATCH 51/68] update mbt.yml --- .github/workflows/mbt.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 5d989f2..d8f8641 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -106,7 +106,7 @@ jobs: id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) echo "id=$id" >> $GITHUB_OUTPUT - name: Static Code-check - run: cppcheck --enable=all --quiet --error-exitcode=0 -I include . + run: cppcheck --enable=all --suppress=missingIncludeSystem -I include -I /usr/local/include --quiet --error-exitcode=0 -I include . - name: make run: make - name: install From 429181a669bd787319333f4063a90d58d13966c0 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 14 Nov 2023 19:51:11 +0100 Subject: [PATCH 52/68] added cleanup action --- .github/workflows/cleanup.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/cleanup.yml diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml new file mode 100644 index 0000000..99d4b40 --- /dev/null +++ b/.github/workflows/cleanup.yml @@ -0,0 +1,19 @@ +name: Delete old workflow runs +on: + schedule: + - cron: '0 0 1 * *' +# Run monthly, at 00:00 on the 1t day of month. (testing) + +jobs: + del_runs: + runs-on: ubuntu-latest + permissions: + actions: write + steps: + - name: Delete workflow runs + uses: Mattraks/delete-workflow-runs@v2 + with: + token: ${{ github.token }} + repository: ${{ github.repository }} + retain_days: 30 + keep_minimum_runs: 6 From 5e55d6dd33b7000eab941fdee66b112476add17d Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Fri, 26 Jan 2024 15:08:24 +0100 Subject: [PATCH 53/68] we entered 2024 --- include/mbt/Logging.h | 2 +- include/mbt/MbtAPI.h | 2 +- include/mbt/Pattern.h | 2 +- include/mbt/Sentence.h | 2 +- include/mbt/TagLex.h | 2 +- include/mbt/Tagger.h | 2 +- src/GenerateTagger.cxx | 2 +- src/Mbt.cxx | 2 +- src/MbtAPI.cxx | 2 +- src/Mbtg.cxx | 2 +- src/Pattern.cxx | 2 +- src/RunTagger.cxx | 4 ++-- src/Sentence.cxx | 2 +- src/TagLex.cxx | 2 +- src/Tagger.cxx | 2 +- src/convert.cxx | 2 +- src/simpletest.cxx | 2 +- 17 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/mbt/Logging.h b/include/mbt/Logging.h index 94e5a3c..92a3b51 100644 --- a/include/mbt/Logging.h +++ b/include/mbt/Logging.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/MbtAPI.h b/include/mbt/MbtAPI.h index 3976777..7995553 100644 --- a/include/mbt/MbtAPI.h +++ b/include/mbt/MbtAPI.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Pattern.h b/include/mbt/Pattern.h index 0e8427c..ec7d35d 100644 --- a/include/mbt/Pattern.h +++ b/include/mbt/Pattern.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Sentence.h b/include/mbt/Sentence.h index 015b971..65d4330 100644 --- a/include/mbt/Sentence.h +++ b/include/mbt/Sentence.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/TagLex.h b/include/mbt/TagLex.h index e07c340..c39fa78 100644 --- a/include/mbt/TagLex.h +++ b/include/mbt/TagLex.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index f2f2120..94e297b 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index 7f4a795..f5a3feb 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Mbt.cxx b/src/Mbt.cxx index 73fc0f9..ad4af3b 100644 --- a/src/Mbt.cxx +++ b/src/Mbt.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/MbtAPI.cxx b/src/MbtAPI.cxx index 65cd80d..9a31066 100644 --- a/src/MbtAPI.cxx +++ b/src/MbtAPI.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Mbtg.cxx b/src/Mbtg.cxx index 0233dcd..43693ac 100644 --- a/src/Mbtg.cxx +++ b/src/Mbtg.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Pattern.cxx b/src/Pattern.cxx index 8cc9062..a3ca0db 100644 --- a/src/Pattern.cxx +++ b/src/Pattern.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index d0ea257..3c9889c 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp @@ -1309,7 +1309,7 @@ namespace Tagger { void TaggerClass::manifest( const string& prog ){ // present yourself to the user // - cerr << prog << " " << VERSION << " (c) CLST, ILK and CLiPS 1998 - 2023." + cerr << prog << " " << VERSION << " (c) CLST, ILK and CLiPS 1998 - 2024." << endl << "Memory Based Tagger " << endl << "CLST - Centre for Language and Speech Technology," diff --git a/src/Sentence.cxx b/src/Sentence.cxx index 48bc5e3..9fe63b8 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/TagLex.cxx b/src/TagLex.cxx index c136378..94649dc 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 2d836b0..5acbc25 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/convert.cxx b/src/convert.cxx index 09f2364..8f67feb 100644 --- a/src/convert.cxx +++ b/src/convert.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp diff --git a/src/simpletest.cxx b/src/simpletest.cxx index e55d5f3..c6fb8b9 100644 --- a/src/simpletest.cxx +++ b/src/simpletest.cxx @@ -1,5 +1,5 @@ /* - Copyright (c) 1998 - 2023 + Copyright (c) 1998 - 2024 CLST - Radboud University ILK - Tilburg University CLiPS - University of Antwerp From c56df541bbdf5ac51de23081714e198ad0bbb430 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 6 Feb 2024 11:50:07 +0100 Subject: [PATCH 54/68] update actions --- .github/workflows/mbt.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index d8f8641..048c883 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -43,11 +43,11 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.11.0 + uses: styfle/cancel-workflow-action@0.12.1 with: access_token: ${{ github.token }} - - uses: actions/checkout@v3 + - uses: actions/checkout@v4.1.1 - name: Install Build Environment run: | if [ "$RUNNER_OS" == "Linux" ]; then From bef580570cfd77d7f2bd7482b279dcb41929ffa3 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Sat, 24 Feb 2024 16:48:28 +0100 Subject: [PATCH 55/68] start getting rid of ponters to pointers and such --- include/mbt/Tagger.h | 2 +- src/RunTagger.cxx | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index 94e297b..dd75db1 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -81,7 +81,7 @@ namespace Tagger { int **paths; int **temppaths; std::vector path_prob; - n_best_tuple **n_best_array; + std::vectorn_best_array; private: BeamData( const BeamData& ); // inhibit copies BeamData& operator=( const BeamData& ); // inhibit copies diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 3c9889c..1af3539 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -70,7 +70,6 @@ namespace Tagger { size = 0; paths = 0; temppaths = 0; - n_best_array = 0; } BeamData::~BeamData(){ @@ -83,7 +82,6 @@ namespace Tagger { } delete [] paths; delete [] temppaths; - delete [] n_best_array; } bool BeamData::Init( int Size, unsigned int noWords ){ @@ -91,8 +89,8 @@ namespace Tagger { if ( path_prob.size() == 0 ){ // the first time path_prob.resize(Size); - if ( (n_best_array = new n_best_tuple*[Size]) == 0 || - (paths = new int*[Size]) == 0 || + n_best_array.resize(Size); + if ( (paths = new int*[Size]) == 0 || (temppaths = new int*[Size]) == 0 ){ throw runtime_error( "Beam: not enough memory for N-best search tables" ); } @@ -100,9 +98,7 @@ namespace Tagger { for ( int q=0; q < Size; ++q ){ paths[q] = 0; temppaths[q] = 0; - if ( (n_best_array[q] = new n_best_tuple) == 0 ){ - throw runtime_error( "Beam: not enough memory for N-best search tables" ); - } + n_best_array[q] = new n_best_tuple(); } } } @@ -671,14 +667,18 @@ namespace Tagger { bool TaggerClass::NextBest( const sentence& mySentence, vector& TestPat, - int i_word, int beam_cnt ){ + int i_word, + int beam_cnt ){ MatchAction Action = Unknown; if ( Beam->paths[beam_cnt][i_word-1] == EMPTY_PATH ){ return false; } - else if ( mySentence.nextpat( Action, TestPat, - *kwordlist, TheLex, - i_word, Beam->paths[beam_cnt] ) ){ + else if ( !mySentence.nextpat( Action, TestPat, + *kwordlist, TheLex, + i_word, Beam->paths[beam_cnt] ) ){ + return false; + } + else { // Now make a testpattern for Timbl to process. UnicodeString test_string = pat_to_string( mySentence, TestPat, @@ -715,9 +715,6 @@ namespace Tagger { } return true; } - else { - return false; - } } int TaggerClass::TagLine( const UnicodeString& inp, UnicodeString& result ){ From 98c57bc232fa8b3bccc481f0c435d1f6050db817 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 26 Feb 2024 12:35:52 +0100 Subject: [PATCH 56/68] next step refactoring --- include/mbt/Tagger.h | 2 +- src/RunTagger.cxx | 46 +++++++++++++++++++++----------------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index dd75db1..1180664 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -81,7 +81,7 @@ namespace Tagger { int **paths; int **temppaths; std::vector path_prob; - std::vectorn_best_array; + std::vectorn_best_array; private: BeamData( const BeamData& ); // inhibit copies BeamData& operator=( const BeamData& ); // inhibit copies diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 1af3539..4d0810c 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -75,7 +75,6 @@ namespace Tagger { BeamData::~BeamData(){ if ( paths ){ for ( int q=0; q < size; ++q ){ - delete n_best_array[q]; delete [] paths[q]; delete [] temppaths[q]; } @@ -98,7 +97,6 @@ namespace Tagger { for ( int q=0; q < Size; ++q ){ paths[q] = 0; temppaths[q] = 0; - n_best_array[q] = new n_best_tuple(); } } } @@ -121,24 +119,24 @@ namespace Tagger { void BeamData::ClearBest(){ DBG << "clearing n_best_array..." << endl; for ( int i=0; i < size; ++i ){ - n_best_array[i]->clean(); + n_best_array[i].clean(); } } void BeamData::Shift( int no_words, int i_word ){ for ( int q1 = 0; q1 < no_words; ++q1 ){ for ( int jb = 0; jb < size; ++jb ){ - path_prob[jb] = n_best_array[jb]->prob; - if ( n_best_array[jb]->path != EMPTY_PATH ){ + path_prob[jb] = n_best_array[jb].prob; + if ( n_best_array[jb].path != EMPTY_PATH ){ if ( q1 < i_word ){ - DBG << "shift paths[" << n_best_array[jb]->path << "," + DBG << "shift paths[" << n_best_array[jb].path << "," << q1 << "] into paths[" << jb << "," << q1 << "]" << endl; - temppaths[jb][q1] = paths[n_best_array[jb]->path][q1]; + temppaths[jb][q1] = paths[n_best_array[jb].path][q1]; } else if ( q1 == i_word ){ - DBG << "shift tag " << n_best_array[jb]->tag + DBG << "shift tag " << n_best_array[jb].tag << " into paths[" << jb << "," << q1 << "]" << endl; - temppaths[jb][q1] = n_best_array[jb]->tag; + temppaths[jb][q1] = n_best_array[jb].tag; } else { temppaths[jb][q1] = EMPTY_PATH; @@ -175,15 +173,15 @@ namespace Tagger { void BeamData::PrintBest( ostream& os, UnicodeHash& TheLex ){ for ( int i=0; i < size; ++i ){ - if ( n_best_array[i]->path != EMPTY_PATH ){ + if ( n_best_array[i].path != EMPTY_PATH ){ os << "n_best_array[" << i << "] = " - << n_best_array[i]->prob << " " - << n_best_array[i]->path << " " - << indexlex( n_best_array[i]->tag, TheLex ) << endl; + << n_best_array[i].prob << " " + << n_best_array[i].path << " " + << indexlex( n_best_array[i].tag, TheLex ) << endl; } else { os << "n_best_array[" << i << "] = " - << n_best_array[i]->prob << " EMPTY " << endl; + << n_best_array[i].prob << " EMPTY " << endl; } } } @@ -291,9 +289,9 @@ namespace Tagger { const ClassDistribution *distrib, int beam_cnt ){ if ( size == 1 ){ - n_best_array[0]->prob = 1.0; - n_best_array[0]->path = beam_cnt; - n_best_array[0]->tag = TheLex.hash( answer->name() ); + n_best_array[0].prob = 1.0; + n_best_array[0].path = beam_cnt; + n_best_array[0].tag = TheLex.hash( answer->name() ); } else { DBG << "BeamData::NextPath[" << beam_cnt << "] ( " << answer << " , " @@ -308,27 +306,27 @@ namespace Tagger { double thisPProb = thisWProb * path_prob[beam_cnt]; int dtag = TheLex.hash( d_pnt->name ); for ( int ane = size-1; ane >=0; --ane ){ - if ( thisPProb <= n_best_array[ane]->prob ) + if ( thisPProb <= n_best_array[ane].prob ) break; if ( ane == 0 || - thisPProb <= n_best_array[ane-1]->prob ){ + thisPProb <= n_best_array[ane-1].prob ){ if ( ane == 0 ){ DBG << "Insert, n=0" << endl; } else { DBG << "Insert, n=" << ane << " Prob = " << thisPProb - << " after prob = " << n_best_array[ane-1]->prob + << " after prob = " << n_best_array[ane-1].prob << endl; } // shift - n_best_tuple *keep = n_best_array[size-1]; + n_best_tuple keep = n_best_array[size-1]; for ( int ash = size-1; ash > ane; --ash ){ n_best_array[ash] = n_best_array[ash-1]; } n_best_array[ane] = keep; - n_best_array[ane]->prob = thisPProb; - n_best_array[ane]->path = beam_cnt; - n_best_array[ane]->tag = dtag; + n_best_array[ane].prob = thisPProb; + n_best_array[ane].path = beam_cnt; + n_best_array[ane].tag = dtag; } } } From 29d62a89612258730588e2df71da75f226c44baa Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 26 Feb 2024 13:25:56 +0100 Subject: [PATCH 57/68] more refeactoring done --- include/mbt/Sentence.h | 8 ++++++-- include/mbt/Tagger.h | 4 ++-- src/GenerateTagger.cxx | 3 ++- src/RunTagger.cxx | 40 +++++++--------------------------------- src/Sentence.cxx | 5 +++-- 5 files changed, 20 insertions(+), 40 deletions(-) diff --git a/include/mbt/Sentence.h b/include/mbt/Sentence.h index 65d4330..f4ec6ab 100644 --- a/include/mbt/Sentence.h +++ b/include/mbt/Sentence.h @@ -71,8 +71,12 @@ namespace Tagger { void clear(); bool init_windowing( const std::map&, UnicodeHash& ); - bool nextpat( MatchAction&, std::vector&, UnicodeHash& , UnicodeHash&, - unsigned int, int * = 0 ) const; + bool nextpat( MatchAction&, + std::vector&, + UnicodeHash&, + UnicodeHash&, + unsigned int, + const std::vector& ) const; int classify_hapax( const icu::UnicodeString&, UnicodeHash& ) const; void assign_tag( int, unsigned int ); icu::UnicodeString getword( unsigned int i ) const { diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index 1180664..c8a12aa 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -78,8 +78,8 @@ namespace Tagger { void Print( std::ostream& os, int i_word, Hash::UnicodeHash& TheLex ); void PrintBest( std::ostream& os, Hash::UnicodeHash& TheLex ); int size; - int **paths; - int **temppaths; + std::vector> paths; + std::vector> temppaths; std::vector path_prob; std::vectorn_best_array; private: diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index f5a3feb..325deae 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -259,9 +259,10 @@ namespace Tagger { // of the words in the dictionary and the values // of the features are stored in the testpattern int swcn = 0; + vector dummy(1,0); while( mySentence.nextpat( Action, TestPat, *kwordlist, TheLex, - swcn ) ){ + swcn, dummy ) ){ bool skip = false; if ( DoNpax && !do_known ){ if ( (uwordlist->lookup( mySentence.getword(swcn))) == 0 ){ diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 4d0810c..97378c6 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -66,21 +66,10 @@ namespace Tagger { class BeamData; - BeamData::BeamData(){ - size = 0; - paths = 0; - temppaths = 0; + BeamData::BeamData():size(0){ } BeamData::~BeamData(){ - if ( paths ){ - for ( int q=0; q < size; ++q ){ - delete [] paths[q]; - delete [] temppaths[q]; - } - } - delete [] paths; - delete [] temppaths; } bool BeamData::Init( int Size, unsigned int noWords ){ @@ -89,28 +78,12 @@ namespace Tagger { // the first time path_prob.resize(Size); n_best_array.resize(Size); - if ( (paths = new int*[Size]) == 0 || - (temppaths = new int*[Size]) == 0 ){ - throw runtime_error( "Beam: not enough memory for N-best search tables" ); - } - else { - for ( int q=0; q < Size; ++q ){ - paths[q] = 0; - temppaths[q] = 0; - } - } - } - else { - for ( int q=0; q < Size; ++q ){ - delete [] paths[q]; - delete [] temppaths[q]; - } + paths.resize(Size); + temppaths.resize(Size); } for ( int q=0; q < Size; ++q ){ - if ( (paths[q] = new int[noWords]) == 0 || - (temppaths[q] = new int[noWords]) == 0 ){ - throw runtime_error( "Beam: not enough memory for N-best search tables" ); - } + paths[q].resize(noWords,0); + temppaths[q].resize(noWords,0); } size = Size; return true; @@ -819,7 +792,8 @@ namespace Tagger { MatchAction Action = Unknown; vector TestPat; TestPat.reserve(Utemplate.totalslots()); - if ( mySentence.nextpat( Action, TestPat, *kwordlist, TheLex, 0 )){ + vector start; + if ( mySentence.nextpat( Action, TestPat, *kwordlist, TheLex, 0, start )){ DBG << "Start: " << mySentence.getword( 0 ) << endl; InitTest( mySentence, TestPat, Action ); for ( unsigned int iword=1; iword < mySentence.size(); ++iword ){ diff --git a/src/Sentence.cxx b/src/Sentence.cxx index 9fe63b8..973cbeb 100644 --- a/src/Sentence.cxx +++ b/src/Sentence.cxx @@ -233,7 +233,8 @@ namespace Tagger { bool sentence::nextpat( MatchAction& Action, vector& Pat, UnicodeHash& wordlist, UnicodeHash& TheLex, - unsigned int position, int *old_pat ) const { + unsigned int position, + const vector& old_pat ) const { Pat.clear(); // safety check: // @@ -340,7 +341,7 @@ namespace Tagger { // switch(aTemplate->templatestring[ii]){ case 'd': - if ( old_pat == 0 ){ + if ( old_pat[0] == 0 ){ Pat.push_back( wPtr->word_ass_tag ); } else { From bffdde2b905da21f9925c102b316033fb2b8dc84 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 18 Apr 2024 15:39:13 +0200 Subject: [PATCH 58/68] fixes for macos --- .github/workflows/mbt.yml | 9 ++++---- bootstrap.sh | 43 ++++++++++++++++++++++++++++++++++++--- m4/ac_osx_pkg.m4 | 18 +++++++++++++--- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 048c883..f2ec501 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -54,10 +54,11 @@ jobs: sudo apt-get install pkg-config autoconf-archive else brew upgrade; - brew install pkg-config; - brew install autoconf-archive; - brew install autoconf; - brew install automake; + brew install pkg-config + brew install libtool + brew install autoconf-archive + brew install autoconf + brew install automake fi - name: Install Dependencies run: | diff --git a/bootstrap.sh b/bootstrap.sh index 6112ee1..f25bc05 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # bootstrap - script to bootstrap the distribution rolling engine # usage: @@ -35,15 +37,50 @@ aclocal=aclocal # svn log --verbose > ChangeLog #} -if $automake --version|head -1 |grep ' 1\.[4-9]'; then - echo "automake 1.4-1.9 is active. You should use automake 1.10 or later" +# inspired by hack as used in mcl (from http://micans.org/) + +# autoconf-archive Debian package, aclocal-archive RPM, obsolete/badly supported OS, installed in home dir +acdirs="/usr/share/autoconf-archive/ /usr/share/aclocal/ /usr/local/share/aclocal/ $HOME/local/share/autoconf-archive/ /opt/homebrew/share/aclocal/" + + found=false + for d in $acdirs + do + if test -f ${d}pkg.m4 + then + found=true + break + fi + done + + if ! $found + then + cat < +# Copyright © 2024 Ko van der Sloot # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -40,6 +40,13 @@ case ${host_os} in export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig" fi done + for i in `ls /opt/homewbrew/opt` + do + if test -d "/opt/homebrew/opt/$i/lib/pkgconfig" + then + export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/opt/homebrew/opt/$i/lib/pkgconfig" + fi + done ;; esac ]) @@ -55,12 +62,17 @@ case ${host_os} in # linux is wellbehaved ;; darwin*) - # darwin isn't + # darwin/macos isn't for i in $* do if test -d "/usr/local/opt/$i/lib/pkgconfig" then export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig" + else + if test -d "/opt/homebrew/opt//$i/lib/pkgconfig" + then + export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/opt/homebrew/opt/$i/lib/pkgconfig" + fi fi done ;; From 125c4cb9b81010b6b15627025580b7555efa8284 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 18 Apr 2024 15:42:35 +0200 Subject: [PATCH 59/68] make sure to run on macos-14 not latest (for now) --- .github/workflows/mbt.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index f2ec501..97c0616 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -38,7 +38,7 @@ jobs: needs: notification strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-14] compiler: [g++, clang++] steps: From a19c10a80a23e33d6f41ec696c37cfb1c039693f Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 2 May 2024 17:43:27 +0200 Subject: [PATCH 60/68] restrict CppCheck parameters --- .github/workflows/mbt.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 97c0616..0a61b74 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -71,6 +71,16 @@ jobs: brew install zlib brew install cppcheck fi + + - name: Configure CppCheck + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cpc_opts="--enable=warning,style --inline-suppr --force -I include -I /usr/local/include --quiet --error-exitcode=0" + else + cpc_opts="--enable=warning,style --inline-suppr --force -I include --check-level=exhaustive --quiet --error-exitcode=0" + fi + echo "cpc_opts=$cpc_opts" >> $GITHUB_ENV + - name: install TiccUtils env: CXX: ${{ matrix.compiler }} @@ -107,7 +117,7 @@ jobs: id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) echo "id=$id" >> $GITHUB_OUTPUT - name: Static Code-check - run: cppcheck --enable=all --suppress=missingIncludeSystem -I include -I /usr/local/include --quiet --error-exitcode=0 -I include . + run: cppcheck ${{ env.cpc_opts }} . - name: make run: make - name: install From dd299b11681babc9fdc44ddb65982d1aa9ee9672 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 2 May 2024 17:57:56 +0200 Subject: [PATCH 61/68] code quality --- include/mbt/Tagger.h | 4 ++-- src/GenerateTagger.cxx | 2 +- src/RunTagger.cxx | 15 +++++++-------- src/TagLex.cxx | 6 ++++-- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/mbt/Tagger.h b/include/mbt/Tagger.h index c8a12aa..dcd3e7f 100644 --- a/include/mbt/Tagger.h +++ b/include/mbt/Tagger.h @@ -65,7 +65,7 @@ namespace Tagger { public: BeamData(); ~BeamData(); - bool Init( int, unsigned int ); + void Init( int, unsigned int ); void InitPaths( Hash::UnicodeHash&, const Timbl::TargetValue *, const Timbl::ClassDistribution * ); @@ -96,7 +96,7 @@ namespace Tagger { ~TaggerClass(); bool InitTagging(); bool InitLearning(); - bool InitBeaming( unsigned int ); + void InitBeaming( unsigned int ); TaggerClass *clone() const; int Run( ); std::vector tagLine( const icu::UnicodeString& ); diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index 325deae..8f1574f 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -119,7 +119,7 @@ namespace Tagger { out_file.good() ) ){ COUT << " Creating lexicon: " << LexFileName << " of " << TagVect.size() << " entries." << endl; - for ( auto const& tv : TagVect ){ + for ( auto const *tv : TagVect ){ out_file << tv->Freq() << " " << tv->Word << " " << tv->DisplayTagFreqs() << endl; } diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index 97378c6..fdb7bb0 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -72,7 +72,7 @@ namespace Tagger { BeamData::~BeamData(){ } - bool BeamData::Init( int Size, unsigned int noWords ){ + void BeamData::Init( int Size, unsigned int noWords ){ // Beaming Stuff... if ( path_prob.size() == 0 ){ // the first time @@ -86,7 +86,6 @@ namespace Tagger { temppaths[q].resize(noWords,0); } size = Size; - return true; } void BeamData::ClearBest(){ @@ -311,11 +310,11 @@ namespace Tagger { } } - bool TaggerClass::InitBeaming( unsigned int no_words ){ + void TaggerClass::InitBeaming( unsigned int no_words ){ if ( !Beam ){ Beam = new BeamData(); } - return Beam->Init( Beam_Size, no_words ); + Beam->Init( Beam_Size, no_words ); } int TaggerClass::ProcessLines( istream &is, ostream& os ){ @@ -780,11 +779,11 @@ namespace Tagger { vector TaggerClass::tagSentence( sentence& mySentence ){ vector result; + if ( !initialized ){ + throw runtime_error( "Tagger not initialized" ); + } if ( mySentence.size() != 0 ){ - if ( !initialized || - !InitBeaming( mySentence.size() ) ){ - throw runtime_error( "Tagger not initialized" ); - } + InitBeaming( mySentence.size() ); DBG << mySentence << endl; if ( mySentence.init_windowing( *MT_lexicon, TheLex ) ) { // here the word window is looked up in the dictionary and the values diff --git a/src/TagLex.cxx b/src/TagLex.cxx index 94649dc..1d6fe7a 100644 --- a/src/TagLex.cxx +++ b/src/TagLex.cxx @@ -45,7 +45,9 @@ namespace Tagger { TagInfo::TagInfo( const UnicodeString& word, const UnicodeString& tag ): - Word(word), WordFreq(0) { + Word(word), + WordFreq(0) + { Update( tag ); } @@ -143,7 +145,7 @@ namespace Tagger { } void StoreInVector( TagInfo *TI, void *arg ){ - vector *vec = (vector *)arg; + vector *vec = static_cast*>(arg); vec->push_back( TI ); } From 04aa5faaa9a0ee728d0faddf5a2d05f7489c9485 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 2 Sep 2024 17:18:15 +0200 Subject: [PATCH 62/68] force g++17 --- configure.ac | 13 +++---------- src/Makefile.am | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/configure.ac b/configure.ac index 3c413ce..d5beb50 100644 --- a/configure.ac +++ b/configure.ac @@ -9,19 +9,12 @@ AC_CONFIG_SRCDIR([.]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_HEADERS([config.h]) -if test x"${CXXFLAGS+set}" = xset; then - # the user set CXXFLAGS; don't override it. - cxx_flags_were_set=true -else - cxx_flags_were_set=false -fi - -if $cxx_flags_were_set; then - CXXFLAGS=$CXXFLAGS -fi +AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX_17]) # Checks for programs. AC_PROG_CXX( [g++] ) +AX_CXX_COMPILE_STDCXX_17 + LT_INIT # when running tests, use CXX diff --git a/src/Makefile.am b/src/Makefile.am index baeecb6..055bc23 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,5 @@ AM_CPPFLAGS = -I@top_srcdir@/include -AM_CXXFLAGS = -std=c++14 -W -Wall -g -pedantic -O3 +AM_CXXFLAGS = -std=c++17 -W -Wall -g -pedantic -O3 LDADD = libmbt.la From 6f655dc583a6eb540abffd82b6364bc4069064c2 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 12 Sep 2024 15:50:08 +0200 Subject: [PATCH 63/68] modernized GitHub CI action --- .github/workflows/mbt.yml | 146 ++++++++++---------------------------- 1 file changed, 37 insertions(+), 109 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 0a61b74..714ed43 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -1,12 +1,15 @@ +--- name: C/C++ CI on: + schedule: + - cron: "0 19 3 * 5" # run test once a month push: branches: [master] paths: + - configure.ac - 'src/**' - 'include/**' - - '.github/**' - '.github/workflows/*' pull_request: @@ -15,133 +18,58 @@ on: jobs: notification: runs-on: ubuntu-latest - name: Notify start to gitlama - outputs: - branch: ${{ steps.extract_branch.outputs.branch }} + name: Notifications steps: - - name: Extract Branch name - id: extract_branch - shell: bash - run: echo "branch=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT - - name: IRC notification - uses: Gottox/irc-message-action@v2 - with: - server: irc.uvt.nl - channel: '#gitlama' - nickname: GitHub - message: > - ${{ github.actor }} started a build of Mbt - [${{ steps.extract_branch.outputs.branch }}] + - name: IRC notification of starting the builds + uses: LanguageMachines/ticcactions/irc-init@v1 build: runs-on: ${{ matrix.os }} needs: notification strategy: matrix: - os: [ubuntu-latest, macos-14] - compiler: [g++, clang++] + os: [ubuntu-latest, macos-latest] + compiler: [g++-12, clang++] steps: + - name: exclude Mac-OS with g++ + shell: bash + if: ${{ matrix.os == 'macos-latest' && matrix.compiler == 'g++-12' }} + run: | + echo "action_status=skip" >> $GITHUB_ENV + echo "action_details='c++ library issues'" >> $GITHUB_ENV + - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.12.1 with: access_token: ${{ github.token }} - uses: actions/checkout@v4.1.1 - - name: Install Build Environment - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get install pkg-config autoconf-archive - else - brew upgrade; - brew install pkg-config - brew install libtool - brew install autoconf-archive - brew install autoconf - brew install automake - fi - - name: Install Dependencies - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get install libicu-dev libxml2-dev libbz2-dev - sudo apt-get install zlib1g-dev cppcheck - else - brew install libxml2 - brew install bzip2 - brew install zlib - brew install cppcheck - fi - - name: Configure CppCheck - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - cpc_opts="--enable=warning,style --inline-suppr --force -I include -I /usr/local/include --quiet --error-exitcode=0" - else - cpc_opts="--enable=warning,style --inline-suppr --force -I include --check-level=exhaustive --quiet --error-exitcode=0" - fi - echo "cpc_opts=$cpc_opts" >> $GITHUB_ENV + - uses: LanguageMachines/ticcactions/cpp-build-env@v1 + - uses: LanguageMachines/ticcactions/cpp-dependencies@v1 + - uses: LanguageMachines/ticcactions/irc-nick@v1 - - name: install TiccUtils - env: - CXX: ${{ matrix.compiler }} - run: | - git clone https://github.com/LanguageMachines/ticcutils; - cd ticcutils; - bash bootstrap.sh; - ./configure; - make; - sudo make install; - cd ..; + - uses: LanguageMachines/ticcactions/cpp-submodule-build@v1 + with: + module: ticcutils - - name: install TiMBL - env: - CXX: ${{ matrix.compiler }} - run: | - git clone https://github.com/LanguageMachines/timbl; - cd timbl; - bash bootstrap.sh; - ./configure; - make; - sudo make install; - cd ..; + - uses: LanguageMachines/ticcactions/cpp-submodule-build@v1 + with: + module: timbl - - name: bootstrap - run: sh bootstrap.sh - - name: configure - env: - CXX: ${{ matrix.compiler }} - run: ./configure - - name: compiler-id - id: compiler - run: | - id=$(echo ${{matrix.compiler}} | cut -d\+ -f1) - echo "id=$id" >> $GITHUB_OUTPUT + - uses: LanguageMachines/ticcactions/setup-cppcheck@v1 - name: Static Code-check + if: ${{ env.action_status == '' }} run: cppcheck ${{ env.cpc_opts }} . - - name: make - run: make - - name: install - run: sudo make install - - name: make check - env: - CXX: ${{ matrix.compiler }} - run: LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib make check - continue-on-error: true - - name: show log - run: cat src/test-suite.log - - name: Notify IRC of failure - if: ${{ failure() }} - uses: Gottox/irc-message-action@v2 - with: - server: irc.uvt.nl - channel: '#gitlama' - nickname: GH-${{ runner.os }}-${{ steps.compiler.outputs.id }} - message: "mbt [${{ needs.notification.outputs.branch }}] build with ${{ matrix.compiler }} by ${{ github.actor }} on ${{ matrix.os }}: \u00034FAIL\u0003" - - name: Notify IRC of succes - if: ${{ success() }} - uses: Gottox/irc-message-action@v2 + + - uses: LanguageMachines/ticcactions/cpp-safe-build@v1 + + - name: Notify IRC of results + uses: LanguageMachines/ticcactions/irc-status@v1 with: - server: irc.uvt.nl - channel: '#gitlama' - nickname: GH-${{ runner.os }}-${{ steps.compiler.outputs.id }} - message: "mbt [${{ needs.notification.outputs.branch }}] build with ${{ matrix.compiler }} by ${{ github.actor }} on ${{ matrix.os }}: \u00033SUCCESS\u0003" + branch: ${{ github.ref_name }} + nickname: ${{ env.nick }} + step: test + status: ${{ env.action_status }} + details: ${{ env.action_details }} From 167a7395b13f9540de5808076d1921d9150d51c1 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Tue, 24 Sep 2024 17:08:05 +0200 Subject: [PATCH 64/68] no longer exclude MaxOS/gcc from build --- .github/workflows/mbt.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 714ed43..7e72bc8 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -32,13 +32,6 @@ jobs: compiler: [g++-12, clang++] steps: - - name: exclude Mac-OS with g++ - shell: bash - if: ${{ matrix.os == 'macos-latest' && matrix.compiler == 'g++-12' }} - run: | - echo "action_status=skip" >> $GITHUB_ENV - echo "action_details='c++ library issues'" >> $GITHUB_ENV - - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.12.1 with: From 2d5c9e8e9cd53478172ff54fc5bb7d2329e32921 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 7 Nov 2024 14:58:33 +0100 Subject: [PATCH 65/68] modernizing a bit --- src/GenerateTagger.cxx | 2 +- src/convert.cxx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index 8f1574f..b8bd01a 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -80,7 +80,7 @@ namespace Tagger { template struct more_second { - typedef pair type; + using type = pair; bool operator ()(type const& a, type const& b) const { return a.second > b.second; } diff --git a/src/convert.cxx b/src/convert.cxx index 8f67feb..aed5244 100644 --- a/src/convert.cxx +++ b/src/convert.cxx @@ -42,7 +42,7 @@ using std::ws; using std::map; using std::string; -typedef map string_map; +using string_map = map; void fill_map( istream& in, string_map& map ){ string tag, word; From 32610f939f541d0feda278e9580cd8534231f3d3 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Thu, 5 Dec 2024 10:53:32 +0100 Subject: [PATCH 66/68] Squashed commit of the following: modernizing, updated to latest ticcutils --- .github/workflows/mbt.yml | 6 +++++- configure.ac | 2 +- src/GenerateTagger.cxx | 24 ++++++++++++------------ src/MbtAPI.cxx | 4 ++-- src/RunTagger.cxx | 10 +++++----- src/Tagger.cxx | 11 ++++++----- src/convert.cxx | 2 +- 7 files changed, 32 insertions(+), 27 deletions(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 7e72bc8..9149ee4 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -5,7 +5,9 @@ on: schedule: - cron: "0 19 3 * 5" # run test once a month push: - branches: [master] + branches: + - master + - develop paths: - configure.ac - 'src/**' @@ -45,10 +47,12 @@ jobs: - uses: LanguageMachines/ticcactions/cpp-submodule-build@v1 with: + branch: ${{ github.ref_name }} module: ticcutils - uses: LanguageMachines/ticcactions/cpp-submodule-build@v1 with: + branch: ${{ github.ref_name }} module: timbl - uses: LanguageMachines/ticcactions/setup-cppcheck@v1 diff --git a/configure.ac b/configure.ac index d5beb50..72c7916 100644 --- a/configure.ac +++ b/configure.ac @@ -60,7 +60,7 @@ PKG_CHECK_MODULES([timbl], [timbl >= 6.8] ) CXXFLAGS="$CXXFLAGS $timbl_CFLAGS" LIBS="$LIBS $timbl_LIBS" -PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.34] ) +PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.36] ) CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS" LIBS="$LIBS $ticcutils_LIBS" diff --git a/src/GenerateTagger.cxx b/src/GenerateTagger.cxx index b8bd01a..d63ff80 100644 --- a/src/GenerateTagger.cxx +++ b/src/GenerateTagger.cxx @@ -80,8 +80,8 @@ namespace Tagger { template struct more_second { - using type = pair; - bool operator ()(type const& a, type const& b) const { + using c_type = pair; + bool operator ()(c_type const& a, c_type const& b) const { return a.second > b.second; } }; @@ -531,24 +531,24 @@ namespace Tagger { } if ( opts.extract( 'D', value ) ){ if ( value == "LogSilent" ){ - cur_log->setlevel( LogSilent ); - default_cout.setlevel( LogSilent ); + cur_log->set_level( LogSilent ); + default_cout.set_level( LogSilent ); } else if ( value == "LogNormal" ){ - cur_log->setlevel( LogNormal ); - default_cout.setlevel( LogNormal ); + cur_log->set_level( LogNormal ); + default_cout.set_level( LogNormal ); } else if ( value == "LogDebug" ){ - cur_log->setlevel( LogDebug ); - default_cout.setlevel( LogDebug ); + cur_log->set_level( LogDebug ); + default_cout.set_level( LogDebug ); } else if ( value == "LogHeavy" ){ - cur_log->setlevel( LogHeavy ); - default_cout.setlevel( LogHeavy ); + cur_log->set_level( LogHeavy ); + default_cout.set_level( LogHeavy ); } else if ( value == "LogExtreme" ){ - cur_log->setlevel( LogExtreme ); - default_cout.setlevel( LogExtreme ); + cur_log->set_level( LogExtreme ); + default_cout.set_level( LogExtreme ); } else { cerr << "Unknown Debug mode! (-D " << value << ")" << endl; diff --git a/src/MbtAPI.cxx b/src/MbtAPI.cxx index 9a31066..5e6c2dd 100644 --- a/src/MbtAPI.cxx +++ b/src/MbtAPI.cxx @@ -139,7 +139,7 @@ bool MbtAPI::GenerateTagger(int argc, char *argv[]) { // generate a tagger using argv. // Independent, static function so, don't use the internal _tagger here // - cur_log->setlevel( Tagger_Log_Level ); + cur_log->set_level( Tagger_Log_Level ); time_t timebefore, timeafter, timediff; time(&timebefore); int nw = TaggerClass::CreateTagger( argc, argv ); @@ -164,7 +164,7 @@ bool MbtAPI::GenerateTagger( const std::string& arg ) { // generate a tagger using a string. // Independent, static function so, don't use the internal _tagger here // - cur_log->setlevel( Tagger_Log_Level ); + cur_log->set_level( Tagger_Log_Level ); time_t timebefore, timeafter, timediff; time(&timebefore); int nw = TaggerClass::CreateTagger( arg ); diff --git a/src/RunTagger.cxx b/src/RunTagger.cxx index fdb7bb0..e9f9cf4 100644 --- a/src/RunTagger.cxx +++ b/src/RunTagger.cxx @@ -1170,19 +1170,19 @@ namespace Tagger { } if ( Opts.extract( 'D', value ) ){ if ( value == "LogSilent" ) { - cur_log->setlevel( LogSilent ); + cur_log->set_level( LogSilent ); } else if ( value == "LogNormal" ){ - cur_log->setlevel( LogNormal ); + cur_log->set_level( LogNormal ); } else if ( value == "LogDebug" ){ - cur_log->setlevel( LogDebug ); + cur_log->set_level( LogDebug ); } else if ( value == "LogHeavy" ){ - cur_log->setlevel( LogHeavy ); + cur_log->set_level( LogHeavy ); } else if ( value == "LogExtreme" ){ - cur_log->setlevel( LogExtreme ); + cur_log->set_level( LogExtreme ); } else { cerr << "Unknown Debug mode! (-D " << value << ")" << endl; diff --git a/src/Tagger.cxx b/src/Tagger.cxx index 5acbc25..30ff6cc 100644 --- a/src/Tagger.cxx +++ b/src/Tagger.cxx @@ -55,8 +55,8 @@ using namespace std; using namespace icu; LogStream default_log( cerr ); -LogStream default_cout( cout, "", NoStamp); LogStream *cur_log = &default_log; // fill the externals +LogStream default_cout( cout ); LogLevel internal_default_level = LogNormal; LogLevel Tagger_Log_Level = internal_default_level; @@ -78,9 +78,9 @@ namespace Tagger { TaggerClass::TaggerClass(): cur_log(new LogStream( cerr )) { - cur_log->setlevel( LogNormal ); - cur_log->setstamp( StampMessage ); - default_cout.setstamp( NoStamp ); + cur_log->set_level( LogNormal ); + cur_log->set_stamp( StampMessage ); + default_cout.set_stamp( NoStamp ); KnownTree = NULL; unKnownTree = NULL; TimblOptStr = "+vS -FColumns K: -a IGTREE +D U: -a IB1 "; @@ -186,7 +186,8 @@ namespace Tagger { if ( !cloned ){ delete cur_log; } - cur_log = new LogStream( os, "mbt-" ); + cur_log = new LogStream( &os ); + cur_log->add_message( "mbt-" ); return true; } diff --git a/src/convert.cxx b/src/convert.cxx index aed5244..8f67feb 100644 --- a/src/convert.cxx +++ b/src/convert.cxx @@ -42,7 +42,7 @@ using std::ws; using std::map; using std::string; -using string_map = map; +typedef map string_map; void fill_map( istream& in, string_map& map ){ string tag, word; From 9807fa18ebd92a79855ecd682413bb889181e777 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 16 Dec 2024 11:08:17 +0100 Subject: [PATCH 67/68] NEWS --- NEWS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS b/NEWS index 9441a7f..571b752 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +mbt version 3.11 2024-12-16 +[Ko van der Sloot] +* updated to latest ticcutils +* C++17 is required +* some code improvements (more C++, less pointers) +* updated GitHub CI + mbt version 3.10 2023-10-21 [Ko van der Sloot] * removed dependency on libtar From 9520489af37a71881f1e4a7d4b65bd6b60e79b99 Mon Sep 17 00:00:00 2001 From: Ko van der Sloot Date: Mon, 23 Jun 2025 10:22:37 +0200 Subject: [PATCH 68/68] fix CI --- .github/workflows/mbt.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mbt.yml b/.github/workflows/mbt.yml index 9149ee4..34babf8 100644 --- a/.github/workflows/mbt.yml +++ b/.github/workflows/mbt.yml @@ -31,7 +31,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - compiler: [g++-12, clang++] + compiler: [g++-12, clang++ -std=c++17] steps: - name: Cancel Previous Runs