Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
* Added `seqan3::views::char_strictly_to`. Behaves like `seqan3::views::char_to`, but throws on invalid
input ([\#2898](https://github.com/seqan/seqan3/pull/2898)).
* Improved performance of vector assignment for alphabets ([\#3038](https://github.com/seqan/seqan3/pull/3038)).
* Improved performance of `seqan3::dna4::complement()` ([\#3026](https://github.com/seqan/seqan3/pull/3026)).

#### I/O
* Added `seqan3::sequence_file_option::fasta_ignore_blanks_before_id` to ignore blanks before IDs when reading FASTA
Expand Down
11 changes: 11 additions & 0 deletions doc/cookbook/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,14 @@ This will use `4` threads by default and can be adjusted by setting `seqan3::con
the desired value:

\snippet doc/cookbook/compression_threads.cpp example

# Auto vectorized dna4 complement

Our alphabet seqan3::dna4 cannot be easily auto-vectorized by the compiler.

See [this discussion](https://github.com/seqan/seqan3/issues/1970) for more details.

You can add your own alphabet that is auto-vectorizable in some use cases.
Here is an example for a dna4-like alphabet:

\snippet test/performance/simd_dna4.hpp cookbook
9 changes: 9 additions & 0 deletions doc/cookbook/simd_dna4.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Checks that defining simd_dna4 works without putting it into the seqan3 namespace.

#define SEQAN3_USE_NAMESPACE 0
#include <seqan3/test/performance/simd_dna4.hpp>

int main()
{
simd_dna4 [[maybe_unused]] letter{};
}
6 changes: 6 additions & 0 deletions include/seqan3/alphabet/nucleotide/dna4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ class dna4 : public nucleotide_base<dna4, 4>
}
//!\}

//!\brief Returns the complement of the current nucleotide.
constexpr dna4 complement() const noexcept
{
return dna4{}.assign_rank(to_rank() ^ 0b11);
}

private:
/*!\brief The lookup table used in #rank_to_char.
* \details
Expand Down
102 changes: 102 additions & 0 deletions test/include/seqan3/test/performance/simd_dna4.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

#pragma once

#include <seqan3/alphabet/nucleotide/nucleotide_base.hpp>
#include <seqan3/alphabet/nucleotide/rna4.hpp>

// Conditionally add the seqan3 namespace:
// unit/performance tests: in seqan3 namespace
// cookbook entry: no namespace
#ifndef SEQAN3_USE_NAMESPACE
# define SEQAN3_USE_NAMESPACE 1
#endif

#if SEQAN3_USE_NAMESPACE
namespace seqan3
{
#endif
/* See discussion here: https://github.com/seqan/seqan3/issues/1970
*
* If AVX2 is available, this significantly improves the performance
* (roughly 5 times faster). But if no AVX2 is available. this decreases the
* performance (roughly 3 times slower).
*/
//![cookbook]
class simd_dna4 : public seqan3::nucleotide_base<simd_dna4, 256>
{
private:
using base_t = seqan3::nucleotide_base<simd_dna4, 256>;

friend base_t; // nucleotide_base
friend base_t::base_t; // alphabet_base
friend seqan3::rna4;

public:
constexpr simd_dna4() noexcept = default;
constexpr simd_dna4(simd_dna4 const &) noexcept = default;
constexpr simd_dna4(simd_dna4 &&) noexcept = default;
constexpr simd_dna4 & operator=(simd_dna4 const &) noexcept = default;
constexpr simd_dna4 & operator=(simd_dna4 &&) noexcept = default;
~simd_dna4() noexcept = default;

template <std::same_as<seqan3::rna4> t> // template parameter t to accept incomplete type
constexpr simd_dna4(t const r) noexcept
{
assign_rank(r.to_rank());
}

using base_t::assign_rank;
using base_t::base_t;
using base_t::to_rank;

static constexpr uint8_t alphabet_size = 4;

constexpr simd_dna4 & assign_char(char_type const c) noexcept
{
char_type const upper_case_char = c & 0b0101'1111;
rank_type rank = (upper_case_char == 'T') * 3 + (upper_case_char == 'G') * 2 + (upper_case_char == 'C');
return assign_rank(rank);
}

constexpr char_type to_char() const noexcept
{
rank_type const rank = to_rank();
switch (rank)
{
case 0u:
return 'A';
case 1u:
return 'C';
case 2u:
return 'G';
default:
return 'T';
}
}

constexpr simd_dna4 complement() const noexcept
{
rank_type rank{to_rank()};
rank ^= 0b11;
simd_dna4 ret{};
return ret.assign_rank(rank);
}

static constexpr bool char_is_valid(char_type const c) noexcept
{
char_type const upper_case_char = c & 0b0101'1111;
return (upper_case_char == 'A') || (upper_case_char == 'C') || (upper_case_char == 'G')
|| (upper_case_char == 'T');
}
};
//![cookbook]

#if SEQAN3_USE_NAMESPACE
}
#endif
1 change: 1 addition & 0 deletions test/performance/alphabet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ add_subdirectories ()

seqan3_benchmark (alphabet_assign_char_benchmark.cpp)
seqan3_benchmark (alphabet_assign_rank_benchmark.cpp)
seqan3_benchmark (alphabet_complement_benchmark.cpp)
seqan3_benchmark (alphabet_to_char_benchmark.cpp)
seqan3_benchmark (alphabet_to_rank_benchmark.cpp)
2 changes: 2 additions & 0 deletions test/performance/alphabet/alphabet_assign_char_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <numeric>

#include <seqan3/alphabet/all.hpp>
#include <seqan3/test/performance/simd_dna4.hpp>
#include <seqan3/test/seqan2.hpp>

#if SEQAN3_HAS_SEQAN2
Expand All @@ -38,6 +39,7 @@ void assign_char(benchmark::State & state)
BENCHMARK_TEMPLATE(assign_char, seqan3::gap);
BENCHMARK_TEMPLATE(assign_char, seqan3::dna4);
BENCHMARK_TEMPLATE(assign_char, seqan3::rna4);
BENCHMARK_TEMPLATE(assign_char, seqan3::simd_dna4);
BENCHMARK_TEMPLATE(assign_char, seqan3::dna5);
BENCHMARK_TEMPLATE(assign_char, seqan3::rna5);
BENCHMARK_TEMPLATE(assign_char, seqan3::dna15);
Expand Down
2 changes: 2 additions & 0 deletions test/performance/alphabet/alphabet_assign_rank_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstring>

#include <seqan3/alphabet/all.hpp>
#include <seqan3/test/performance/simd_dna4.hpp>
#include <seqan3/test/seqan2.hpp>

#if SEQAN3_HAS_SEQAN2
Expand Down Expand Up @@ -45,6 +46,7 @@ void assign_rank(benchmark::State & state)
BENCHMARK_TEMPLATE(assign_rank, seqan3::gap);
BENCHMARK_TEMPLATE(assign_rank, seqan3::dna4);
BENCHMARK_TEMPLATE(assign_rank, seqan3::rna4);
BENCHMARK_TEMPLATE(assign_rank, seqan3::simd_dna4);
BENCHMARK_TEMPLATE(assign_rank, seqan3::dna5);
BENCHMARK_TEMPLATE(assign_rank, seqan3::rna5);
BENCHMARK_TEMPLATE(assign_rank, seqan3::dna15);
Expand Down
160 changes: 160 additions & 0 deletions test/performance/alphabet/alphabet_complement_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

/* Copied and adjusted from https://raw.githubusercontent.com/kloetzl/libdna/master/bench2/revcomp.cxx
* Credits go to Fabian Klötzl (@kloetzl - https://github.com/kloetzl)
*/

#include <benchmark/benchmark.h>

#include <random>

#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/alphabet/views/char_to.hpp>
#include <seqan3/alphabet/views/complement.hpp>
#include <seqan3/test/performance/simd_dna4.hpp>

enum class tag
{
revcomp_dna4_inline,
seqan3_dna4,
seqan3_dna4_vector,
seqan3_dna4_simd,
seqan3_dna4_simd_vector
};

class allocator
{
public:
allocator() = delete;
allocator(allocator const &) = delete;
allocator & operator=(allocator const &) = delete;
allocator(allocator &&) = delete;
allocator & operator=(allocator &&) = delete;
~allocator()
{
std::free(forward);
std::free(reverse);
}

allocator(size_t const length)
{
forward = static_cast<char *>(std::malloc(length + 1));
reverse = static_cast<char *>(std::malloc(length + 1));
}

auto get() const
{
return std::make_tuple(forward, reverse);
}

private:
char * forward{nullptr};
char * reverse{nullptr};
};

static void generate_random_dna4_char_string(char * dest, size_t const length)
{
constexpr std::array<char, 8> dna4_chars{'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'};
std::mt19937_64 random_engine{1729u};
std::uniform_int_distribution<size_t> random_index(0u, dna4_chars.size() - 1u);
auto random_dna4_char = [&dna4_chars, &random_engine, &random_index]()
{
return dna4_chars[random_index(random_engine)];
};

std::ranges::generate_n(dest, length, random_dna4_char);
dest[length - 1u] = '\0';
}

static constexpr char * revcomp_dna4_inline(char const * const begin, size_t const length, char * const dest)
{
for (size_t i = 0; i < length; ++i)
{
char c = begin[length - 1 - i];

dest[i] = c ^= c & 2 ? 4 : 21;
}

return dest + length;
}

static constexpr auto seqan3_dna4(std::string_view sv)
{
return sv | seqan3::views::char_to<seqan3::dna4> | std::views::reverse | seqan3::views::complement;
}

static void seqan3_dna4_vector(std::string_view sv, std::vector<seqan3::dna4> & dest)
{
auto revcomp = sv | seqan3::views::char_to<seqan3::dna4> | std::views::reverse | seqan3::views::complement;
std::ranges::copy(revcomp, dest.begin());
}

static constexpr auto seqan3_dna4_simd(std::string_view sv)
{
return sv | seqan3::views::char_to<seqan3::simd_dna4> | std::views::reverse | seqan3::views::complement;
}

static void seqan3_dna4_simd_vector(std::string_view sv, std::vector<seqan3::simd_dna4> & dest)
{
auto revcomp = sv | seqan3::views::char_to<seqan3::simd_dna4> | std::views::reverse | seqan3::views::complement;
std::ranges::copy(revcomp, dest.begin());
}

template <tag id>
void complement(benchmark::State & state)
{
constexpr size_t length{1000003};
allocator alloc{length};
auto [forward, revcomp] = alloc.get();
generate_random_dna4_char_string(forward, length);

auto sv = std::string_view{forward};
using alphabet_t = std::conditional_t<id == tag::seqan3_dna4_vector, seqan3::dna4, seqan3::simd_dna4>;
std::vector<alphabet_t> vector(length);

for (auto _ : state)
{
if constexpr (id == tag::revcomp_dna4_inline)
{
revcomp_dna4_inline(forward, length, revcomp);
benchmark::DoNotOptimize(revcomp);
}
else if constexpr (id == tag::seqan3_dna4)
{
auto view = seqan3_dna4(sv);
for (auto elem : view)
benchmark::DoNotOptimize(elem);
}
else if constexpr (id == tag::seqan3_dna4_vector)
{
seqan3_dna4_vector(sv, vector);
benchmark::DoNotOptimize(vector);
}
else if constexpr (id == tag::seqan3_dna4_simd)
{
auto view = seqan3_dna4_simd(sv);
for (auto elem : view)
benchmark::DoNotOptimize(elem);
}
else if constexpr (id == tag::seqan3_dna4_simd_vector)
{
seqan3_dna4_simd_vector(sv, vector);
benchmark::DoNotOptimize(vector);
}
else
{
throw std::logic_error{"Invalid tag"};
}
}
}

BENCHMARK_TEMPLATE(complement, tag::revcomp_dna4_inline);
BENCHMARK_TEMPLATE(complement, tag::seqan3_dna4);
BENCHMARK_TEMPLATE(complement, tag::seqan3_dna4_vector);
BENCHMARK_TEMPLATE(complement, tag::seqan3_dna4_simd);
BENCHMARK_TEMPLATE(complement, tag::seqan3_dna4_simd_vector);
2 changes: 2 additions & 0 deletions test/performance/alphabet/alphabet_to_char_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstring>

#include <seqan3/alphabet/all.hpp>
#include <seqan3/test/performance/simd_dna4.hpp>
#include <seqan3/test/seqan2.hpp>

#if SEQAN3_HAS_SEQAN2
Expand Down Expand Up @@ -58,6 +59,7 @@ void to_char(benchmark::State & state)
BENCHMARK_TEMPLATE(to_char, seqan3::gap);
BENCHMARK_TEMPLATE(to_char, seqan3::dna4);
BENCHMARK_TEMPLATE(to_char, seqan3::rna4);
BENCHMARK_TEMPLATE(to_char, seqan3::simd_dna4);
BENCHMARK_TEMPLATE(to_char, seqan3::dna5);
BENCHMARK_TEMPLATE(to_char, seqan3::rna5);
BENCHMARK_TEMPLATE(to_char, seqan3::dna15);
Expand Down
2 changes: 2 additions & 0 deletions test/performance/alphabet/alphabet_to_rank_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <algorithm>

#include <seqan3/alphabet/all.hpp>
#include <seqan3/test/performance/simd_dna4.hpp>
#include <seqan3/test/seqan2.hpp>

#if SEQAN3_HAS_SEQAN2
Expand Down Expand Up @@ -52,6 +53,7 @@ void to_rank(benchmark::State & state)
BENCHMARK_TEMPLATE(to_rank, seqan3::gap);
BENCHMARK_TEMPLATE(to_rank, seqan3::dna4);
BENCHMARK_TEMPLATE(to_rank, seqan3::rna4);
BENCHMARK_TEMPLATE(to_rank, seqan3::simd_dna4);
BENCHMARK_TEMPLATE(to_rank, seqan3::dna5);
BENCHMARK_TEMPLATE(to_rank, seqan3::rna5);
BENCHMARK_TEMPLATE(to_rank, seqan3::dna15);
Expand Down
1 change: 1 addition & 0 deletions test/unit/alphabet/nucleotide/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ seqan3_test (rna4_test.cpp)
seqan3_test (rna5_test.cpp)
seqan3_test (rna15_test.cpp)
seqan3_test (nucleotide_conversion_integration_test.cpp)
seqan3_test (simd_dna4_test.cpp)
Loading