/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_VC_
#define VC_VC_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_VECTOR_H_
#define VC_VECTOR_H_

// 1. define all of Vc::Scalar - this one is always present, so it makes sense to put it first
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SCALAR_VECTOR_H_
#define VC_SCALAR_VECTOR_H_

#include <assert.h>
#include <algorithm>
#include <cmath>

#ifdef _MSC_VER
#include <float.h>
#endif

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

/*  This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_TYPES_H_
#define VC_COMMON_TYPES_H_

#ifdef Vc_CHECK_ALIGNMENT
#include <cstdlib>
#include <cstdio>
#endif

#include <Vc/global.h>
/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_TYPE_TRAITS_H_
#define VC_TRAITS_TYPE_TRAITS_H_

#include <type_traits>
/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_DECAY_H_
#define VC_TRAITS_DECAY_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
/// Shorthand for the nested \c type of std::decay<Type>, saving the
/// `typename ...::type` boilerplate at every use.
template <class Type> using decay = typename std::decay<Type>::type;
}  // namespace Traits
}  // namespace Vc

#endif  // VC_TRAITS_DECAY_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
#define VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_

#include <array>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{

/**
 * Customization point behind has_no_allocated_data.
 *
 * The primary template answers \c false. Specialize it (deriving from std::true_type) for
 * a container class that should be usable with SIMD gathers/scatters. Example:
 * \code
 * namespace Vc
 * {
 * namespace Traits
 * {
 * template<typename T> struct has_no_allocated_data_impl<MyContainer<T>> : public std::true_type {};
 * }
 * }
 * \endcode
 *
 * \see has_no_allocated_data
 */
template <class Container> struct has_no_allocated_data_impl : std::false_type
{
};

/**
 * Type trait reporting whether a container keeps its data inside the object itself
 * (\c true) or in memory allocated outside of the object (\c false).
 *
 * Unless a specialization says otherwise, a type is assumed to store its data outside of
 * the object. The specializations provided in this header cover std::array and builtin
 * arrays (T[N] and T[]). Vc::array is presumably covered by a specialization next to its
 * own definition; none is visible here.
 *
 * References and const/volatile qualifiers are stripped from \p T before the decision is
 * delegated to has_no_allocated_data_impl, so a single specialization per container is
 * enough.
 */
template <class T>
struct has_no_allocated_data
    : has_no_allocated_data_impl<
          typename std::remove_cv<typename std::remove_reference<T>::type>::type>
{
};

// specializations: types known to embed their elements in the object
template <class Element> struct has_no_allocated_data_impl<Element[]> : std::true_type
{
};
template <class Element, std::size_t Size>
struct has_no_allocated_data_impl<Element[Size]> : std::true_type
{
};
template <class Element, std::size_t Size>
struct has_no_allocated_data_impl<std::array<Element, Size>> : std::true_type
{
};

// compile-time self-tests
// builtin arrays, with every cv-qualification of the element type:
static_assert(has_no_allocated_data<int[256]>::value, "");
static_assert(has_no_allocated_data<const int[256]>::value, "");
static_assert(has_no_allocated_data<volatile int[256]>::value, "");
static_assert(has_no_allocated_data<const volatile int[256]>::value, "");

// arrays of unknown bound and nested arrays:
static_assert(has_no_allocated_data<int[]>::value, "");
static_assert(has_no_allocated_data<int[2][2]>::value, "");

// std::array, by value and by reference, with every cv-qualification:
static_assert(has_no_allocated_data<std::array<int, 256>>::value, "");
static_assert(has_no_allocated_data<std::array<int, 256> &&>::value, "");
static_assert(has_no_allocated_data<const std::array<int, 256>>::value, "");
static_assert(has_no_allocated_data<const std::array<int, 256> &>::value, "");
static_assert(has_no_allocated_data<volatile std::array<int, 256>>::value, "");
static_assert(has_no_allocated_data<volatile std::array<int, 256> &>::value, "");
static_assert(has_no_allocated_data<const volatile std::array<int, 256>>::value, "");
static_assert(has_no_allocated_data<const volatile std::array<int, 256> &>::value, "");

// pointers say nothing about where the pointee lives:
static_assert(!has_no_allocated_data<int *>::value, "");
static_assert(!has_no_allocated_data<int *const>::value, "");
static_assert(!has_no_allocated_data<const int *>::value, "");
static_assert(!has_no_allocated_data<const int *const>::value, "");

}  // namespace Traits
}  // namespace Vc

#endif // VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2016 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
#define VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_

#include <initializer_list>
#include <memory>

// std::array and std::vector are named by the has_contiguous_storage_impl specializations
// further down. Declaring them by hand inside namespace std is not permitted for user
// code and depends on how the library spells them (class vs. struct, inline or debug
// namespaces), so the real headers are included instead. <iterator> provides
// std::random_access_iterator_tag, which the detection heuristic relies on.
#include <array>
#include <iterator>
#include <vector>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace has_contiguous_storage_detail
{
// NOTE: every overload here is a heuristic only. A RandomAccessIterator does not
// guarantee that the underlying storage is contiguous.

// Container-like types: inspect the category of the nested ::iterator type.
template <class Container, class Iterator = typename Container::iterator>
std::is_base_of<std::random_access_iterator_tag, typename Iterator::iterator_category>
test(int);

// Iterator-like types: inspect the type's own ::iterator_category.
template <class Iterator>
std::is_base_of<std::random_access_iterator_tag, typename Iterator::iterator_category>
test(long);

// Everything else.
template <class> std::false_type test(...);
}  // namespace has_contiguous_storage_detail

/**
 * Implementation of has_contiguous_storage for a type without reference or cv
 * qualifiers. The primary template defers to the overload set in
 * has_contiguous_storage_detail; the specializations below override it for types that
 * are known to store their elements contiguously.
 */
template <class T>
struct has_contiguous_storage_impl : decltype(has_contiguous_storage_detail::test<T>(0))
{
};

/**
 * Trait that tells whether \p T (after removing references and const/volatile) is
 * believed to refer to contiguous storage. The decision is made by
 * has_contiguous_storage_impl.
 */
template <class T>
struct has_contiguous_storage
    : has_contiguous_storage_impl<
          typename std::remove_cv<typename std::remove_reference<T>::type>::type>
{
};

// specializations: pointers and builtin arrays
template <class Element> struct has_contiguous_storage_impl<Element *> : std::true_type
{
};
template <class Element>
struct has_contiguous_storage_impl<const Element *> : std::true_type
{
};
template <class Element, std::size_t Size>
struct has_contiguous_storage_impl<Element[Size]> : std::true_type
{
};

// specializations: standard library types
template <class Element>
struct has_contiguous_storage_impl<std::unique_ptr<Element[]>> : std::true_type
{
};
template <class Element>
struct has_contiguous_storage_impl<std::initializer_list<Element>> : std::true_type
{
};
template <class Element, std::size_t Size>
struct has_contiguous_storage_impl<std::array<Element, Size>> : std::true_type
{
};
template <class Element, class Alloc>
struct has_contiguous_storage_impl<std::vector<Element, Alloc>> : std::true_type
{
};

}  // namespace Traits
}  // namespace Vc

#endif // VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_IS_INITIALIZER_LIST_H_
#define VC_TRAITS_IS_INITIALIZER_LIST_H_

#include <initializer_list>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace is_initializer_list_impl
{
// Catch-all: any list of types that is not exactly one std::initializer_list.
template <class... Ts> struct test : std::false_type
{
};

// Exactly one type, and that type is a std::initializer_list specialization.
template <class Element> struct test<std::initializer_list<Element>> : std::true_type
{
};
}  // namespace is_initializer_list_impl

/**
 * \internal
 *
 * Trait that holds only if \p Args consists of exactly one type which, once decayed, is
 * a std::initializer_list specialization. Any other number or kind of types yields
 * \c false.
 */
template <class... Args>
struct is_initializer_list : is_initializer_list_impl::test<decay<Args>...>
{
};
}  // namespace Traits
}  // namespace Vc

#endif  // VC_TRAITS_IS_INITIALIZER_LIST_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_IS_LOAD_ARGUMENTS_H_
#define VC_TRAITS_IS_LOAD_ARGUMENTS_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
// Only declared at this point; the two-argument case below needs its definition.
template <class Flag> struct is_load_store_flag;

/// Fallback for any argument list not matched by the specializations below: \c false.
template <class... Ts> struct is_load_arguments : public std::false_type
{
};

/// A single argument qualifies if it is a pointer.
template <class Ptr> struct is_load_arguments<Ptr> : public std::is_pointer<Ptr>
{
};

/// Two arguments qualify if the first is a pointer and the second satisfies
/// is_load_store_flag.
template <class Ptr, class Flag>
struct is_load_arguments<Ptr, Flag>
    : public std::integral_constant<bool, (std::is_pointer<Ptr>::value &&
                                           is_load_store_flag<Flag>::value)>
{
};
}  // namespace Traits
}  // namespace Vc

#endif  // VC_TRAITS_IS_LOAD_ARGUMENTS_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014-2016 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
#define VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace is_functor_argument_immutable_impl
{
// Overload set classifying the single parameter of a pointer to member function of the
// form `void (F::*)(...)`, const-qualified or not:
//  - parameter declared by value (A)      -> std::true_type
//  - parameter declared as A & or A &&    -> std::is_const<A>
// The functions are declared only; they are used solely inside decltype.
template <typename F, typename A> std::true_type   test(void (F::*)(A));
template <typename F, typename A> std::true_type   test(void (F::*)(A) const);
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &));
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &) const);
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&));
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&) const);

// Placeholder argument type for the ICC-only callability check in test2(int) below.
struct dummy {};

// generate a true_type for template operator() members in F that are callable with a
// 'const A &' argument even if the template parameter to operator() is fixed to 'A'.
//
// Mechanics: MemberPtr is the type of &F::operator()<A>; the return type is whatever the
// test() overload set above yields for it. The trait calls test2 with an int argument,
// so this overload is preferred over test2(float) whenever it is viable.
template <
    typename F, typename A
#ifdef Vc_ICC
    // this ensures that F is a generic lambda. We can be pretty sure that no one wrote a
    // lambda with Vc::Traits::is_functor_argument_immutable_impl::dummy parameter
    // type. In theory, this is not needed because the return type fails with a
    // substitution failure in that case. Only ICC generates an error instead of doing
    // SFINAE.
    ,
    typename = decltype(std::declval<F &>()(std::declval<dummy &>()))
#endif
    ,
#ifdef Vc_MSVC
// MSVC fails if the template keyword is used to *correctly* tell the compiler that <A> is
// an explicit template instantiation of operator()
#define Vc_TEMPLATE_
#else
#define Vc_TEMPLATE_ template
#endif
    typename MemberPtr = decltype(&F::Vc_TEMPLATE_ operator()<A>)>
decltype(is_functor_argument_immutable_impl::test(std::declval<MemberPtr>())) test2(int);
// Vc_TEMPLATE_ is a helper for the declaration above only.
#undef Vc_TEMPLATE_

// generate a true_type for non-template operator() members in F that are callable with a
// 'const A &' argument.
//
// Mechanics: the return type is what test() yields for the type of &F::operator(). The
// float parameter makes this the fallback behind test2(int) when called with an int.
template <typename F, typename A>
decltype(
    is_functor_argument_immutable_impl::test(std::declval<decltype(&F::operator())>()))
test2(float);

// Counterpart of test() for pointers to free functions of the form `void (*)(...)`.
template <typename A> std::true_type   test3(void(*)(A));
template <typename A> std::is_const<A> test3(void(*)(A &));
template <typename A> std::is_const<A> test3(void(*)(A &&));

}  // namespace is_functor_argument_immutable_impl

/**
 * Trait that derives from the result type of the overload sets in
 * is_functor_argument_immutable_impl: std::true_type if the callable \p F declares its
 * parameter by value, std::is_const<A> (with A deduced from the signature) if it
 * declares it by reference.
 *
 * The third template parameter selects the implementation and defaults to
 * std::is_function<F>::value:
 *  - \c false: \p F (with references removed) is inspected through its operator() via
 *    test2; \p A is passed on as the explicit template argument for a template
 *    operator().
 *  - \c true: \p F is a function type and is inspected via test3; \p A is not used.
 *
 * NOTE(review): every test()/test3() overload requires a `void` return type, so a
 * callable returning anything else appears to have no viable overload here. Confirm
 * that all callers only pass void-returning callables.
 */
template <typename F, typename A, bool = std::is_function<F>::value>
struct is_functor_argument_immutable;
// Functor / lambda case.
template <typename F, typename A>
struct is_functor_argument_immutable<F, A, false>
    : public decltype(is_functor_argument_immutable_impl::test2<
                      typename std::remove_reference<F>::type, A>(int())) {
};
// Plain function case.
template <typename F, typename A>
struct is_functor_argument_immutable<F, A, true>
    : public decltype(is_functor_argument_immutable_impl::test3(std::declval<F>())) {
};

}  // namespace Traits
}  // namespace Vc

#endif  // VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_IS_OUTPUT_ITERATOR_H_
#define VC_TRAITS_IS_OUTPUT_ITERATOR_H_

#include <iterator>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace is_output_iterator_impl
{
// Selected (yielding std::true_type) when std::iterator_traits<T>::value_type exists and
// a value of that type can be assigned to a dereferenced lvalue of type T.
template <typename T, typename ValueType = typename std::iterator_traits<T>::value_type,
          typename = decltype(*std::declval<T &>() = std::declval<
                                  ValueType>())  // tests that assignment to a
                                                 // dereferenced iterator is possible, if
                                                 // yes, T is an OutputIterator
          >
std::true_type test(int);
template <typename T> std::false_type test(...);

// Yields std::is_void<value_type> when std::iterator_traits<T>::value_type exists and
// std::false_type otherwise. Looking the member up in a default template argument keeps
// a missing value_type a substitution failure instead of a hard error.
template <typename T, typename ValueType = typename std::iterator_traits<T>::value_type>
std::is_void<ValueType> has_void_value_type(int);
template <typename T> std::false_type has_void_value_type(...);
}  // namespace is_output_iterator_impl

/**
 * Trait that tells whether \p T can be used as an output iterator.
 *
 * - If std::iterator_traits<T>::value_type is \c void, the result is \c true.
 * - Otherwise the result is \c true exactly if a value_type object can be assigned to
 *   the dereferenced iterator.
 * - If std::iterator_traits<T> has no value_type member (\p T is not an iterator), the
 *   result is \c false. This relies on std::iterator_traits being SFINAE-friendly, which
 *   depends on the standard library in use.
 */
template <typename T>
struct is_output_iterator
    : public std::conditional<
          decltype(is_output_iterator_impl::has_void_value_type<T>(int()))::value,
          std::true_type, decltype(is_output_iterator_impl::test<T>(int()))>::type
{
};

static_assert(!std::is_void<std::iterator_traits<int *>::value_type>::value, "");
static_assert(is_output_iterator<int *>::value, "");
static_assert(!is_output_iterator<const int *>::value, "");

}  // namespace Traits
}  // namespace Vc

#endif  // VC_TRAITS_IS_OUTPUT_ITERATOR_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_IS_INDEX_SEQUENCE_H_
#define VC_IS_INDEX_SEQUENCE_H_

/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_INDEXSEQUENCE_H_
#define VC_COMMON_INDEXSEQUENCE_H_

#include <Vc/global.h>

namespace Vc_VERSIONED_NAMESPACE
{
/** \internal
 * Compile-time list of std::size_t values; a stand-in for the std::index_sequence type
 * of C++14.
 */
template <std::size_t... Indexes> struct index_sequence
{
    /// Returns the number of values held by the sequence.
    static constexpr std::size_t size() noexcept { return sizeof...(Indexes); }
};

/** \internal
 * Builds the type index_sequence<0, 1, ..., N - 1> for a given length \p N.
 *
 * The sequence for N / 2 is computed recursively and then concatenated with a copy of
 * itself that is shifted by an offset, i.e. two index sequences of length N / 2 are
 * joined. For odd \p N one extra element is placed between the two halves.
 */
template <std::size_t N> struct make_index_sequence_impl {
    // even N: the half sequence followed by the same sequence shifted by Offset
    template <std::size_t Offset, std::size_t... Head>
    static index_sequence<Head..., (Head + Offset)...> join(std::false_type,
                                                            index_sequence<Head...>);
    // odd N: as above, with the element Offset - 1 inserted between the two halves
    template <std::size_t Offset, std::size_t... Head>
    static index_sequence<Head..., Offset - 1, (Head + Offset)...> join(
        std::true_type, index_sequence<Head...>);

    using is_odd = std::integral_constant<bool, (N % 2) != 0>;
    using half = typename make_index_sequence_impl<N / 2>::type;
    // join is declared only; decltype picks up the sequence type it would return
    using type = decltype(join<(N + 1) / 2>(is_odd(), half()));
};

// Explicit specializations for the lengths 0, 1 and 2 terminate the recursion.
template <> struct make_index_sequence_impl<0> {
    using type = index_sequence<>;
};
template <> struct make_index_sequence_impl<1> {
    using type = index_sequence<0>;
};
template <> struct make_index_sequence_impl<2> {
    using type = index_sequence<0, 1>;
};

/** \internal
 * Alias for the index_sequence type holding the values 0 to \p N - 1.
 */
template <std::size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
}

#endif  // VC_COMMON_INDEXSEQUENCE_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{

/// Primary template: an arbitrary type is not an index_sequence.
template <typename T> struct is_index_sequence : std::false_type {};
/// Matches every instantiation of Vc::index_sequence.
template <std::size_t... Indexes>
struct is_index_sequence<Vc::index_sequence<Indexes...>> : std::true_type {};

// compile-time self-checks
static_assert(!is_index_sequence<int>::value, "");
static_assert(is_index_sequence<make_index_sequence<2>>::value, "");

}  // namespace Traits
}  // namespace Vc

#endif  // VC_IS_INDEX_SEQUENCE_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
#define VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
/**
 * Determines whether a value of type \p From may be converted implicitly to \p To.
 *
 * The unnamed third parameter selects the implementation: \c true for integral \p From
 * types, \c false for everything else. \c bool (with or without cv-qualifiers) is
 * integral, but std::make_signed and std::make_unsigned must not be instantiated with
 * it; therefore it is handled by the non-integral specialization.
 *
 * In the integral case the conversion is allowed if both types are the same, or if
 * \p To is the signed or the unsigned counterpart of \p From.
 */
template <typename From, typename To,
          bool = std::is_integral<From>::value &&
                 !std::is_same<typename std::remove_cv<From>::type, bool>::value>
struct is_implicit_cast_allowed
    : public std::integral_constant<
          bool, std::is_same<From, To>::value ||
                    (std::is_integral<To>::value &&
                     (std::is_same<typename std::make_unsigned<From>::type, To>::value ||
                      std::is_same<typename std::make_signed<From>::type, To>::value))> {
};

/// All remaining \p From types: only the identity conversion is allowed.
template <typename From, typename To>
struct is_implicit_cast_allowed<From, To, false> : public std::is_same<From, To>::type {
};

/// Variant of the trait for masks; it applies the same rules as
/// is_implicit_cast_allowed.
template <typename From, typename To>
struct is_implicit_cast_allowed_mask : public is_implicit_cast_allowed<From, To> {
};

// compile-time self-checks
static_assert(is_implicit_cast_allowed<float, float>::value, "");
static_assert(!is_implicit_cast_allowed<float, double>::value, "");
static_assert(is_implicit_cast_allowed< int64_t, uint64_t>::value, "");
static_assert(is_implicit_cast_allowed<uint64_t,  int64_t>::value, "");
static_assert(is_implicit_cast_allowed< int32_t, uint32_t>::value, "");
static_assert(is_implicit_cast_allowed<uint32_t,  int32_t>::value, "");
static_assert(is_implicit_cast_allowed< int16_t, uint16_t>::value, "");
static_assert(is_implicit_cast_allowed<uint16_t,  int16_t>::value, "");
static_assert(is_implicit_cast_allowed<  int8_t,  uint8_t>::value, "");
static_assert(is_implicit_cast_allowed< uint8_t,   int8_t>::value, "");
// bool must be usable without instantiating std::make_signed/std::make_unsigned
static_assert(is_implicit_cast_allowed<bool, bool>::value, "");
static_assert(!is_implicit_cast_allowed<bool, int>::value, "");

}  // namespace Traits
}  // namespace Vc

#endif  // VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{
// meta-programming helpers

/// Tag type that enable_if yields when no result type is given explicitly.
struct enable_if_default_type
{
    constexpr enable_if_default_type() {}
};

/// A constant object of type enable_if_default_type.
static constexpr enable_if_default_type nullarg;

/// Shorthand for std::enable_if<Test, T>::type, where \p T defaults to
/// enable_if_default_type.
template <bool Test, typename T = enable_if_default_type>
using enable_if = typename std::enable_if<Test, T>::type;

namespace Traits
{
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
#define VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_

namespace has_subscript_operator_impl
{

// Exact match for an int argument. Viable only if `t[i]` is well-formed for an lvalue t
// of type T and a value i of type I.
template <typename T, typename I,
          typename = decltype(std::declval<T &>()[std::declval<I>()])>
std::true_type test(int);
// Requires an int -> float conversion and therefore loses against the overload above
// whenever that one is viable.
template <typename T, typename I> std::false_type test(float);

}  // namespace has_subscript_operator_impl

/**
 * Derives from std::true_type if an lvalue of type \p T can be subscripted with a value
 * of type \p I (std::size_t by default), and from std::false_type otherwise.
 */
template <typename T, typename I = std::size_t>
struct has_subscript_operator
    : decltype(has_subscript_operator_impl::test<T, I>(1))
{
};

// compile-time self-checks
static_assert(has_subscript_operator<int[]>::value, "");
static_assert(has_subscript_operator<int[], int>::value, "");
static_assert(!has_subscript_operator<int[], void *>::value, "");

#endif // VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
#define VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_

namespace has_multiply_operator_impl
{

// Exact match for an int argument. Viable only if the expression `T * U` is well-formed.
template <typename T, typename U,
          typename = decltype(std::declval<T>() * std::declval<U>())>
std::true_type test(int);
// Ellipsis fallback, chosen when the overload above is discarded.
template <typename T, typename U> std::false_type test(...);

}  // namespace has_multiply_operator_impl

/**
 * Derives from std::true_type if a value of type \p T can be multiplied with a value of
 * type \p U (\p T by default), and from std::false_type otherwise.
 */
template <typename T, typename U = T>
struct has_multiply_operator
    : decltype(has_multiply_operator_impl::test<T, U>(1))
{
};

#endif // VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_HAS_ADDITION_OPERATOR_H_
#define VC_TRAITS_HAS_ADDITION_OPERATOR_H_

namespace has_addition_operator_impl
{

// Exact match for an int argument. Viable only if the expression `T + U` is well-formed.
template <typename T, typename U,
          typename = decltype(std::declval<T>() + std::declval<U>())>
std::true_type test(int);
// Ellipsis fallback, chosen when the overload above is discarded.
template <typename T, typename U> std::false_type test(...);

}  // namespace has_addition_operator_impl

/**
 * Derives from std::true_type if a value of type \p U (\p T by default) can be added to
 * a value of type \p T, and from std::false_type otherwise.
 */
template <typename T, typename U = T>
struct has_addition_operator
    : decltype(has_addition_operator_impl::test<T, U>(1))
{
};

#endif // VC_TRAITS_HAS_ADDITION_OPERATOR_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
#define VC_TRAITS_HAS_EQUALITY_OPERATOR_H_

namespace has_equality_operator_impl
{

// Exact match for an int argument. Viable only if `T == U` is a valid expression whose
// result type is not void.
template <
    typename T, typename U,
    typename = enable_if<
        !std::is_same<void, decltype(std::declval<T>() == std::declval<U>())>::value>>
std::true_type test(int);
// Ellipsis fallback, chosen when the overload above is discarded.
template <typename T, typename U> std::false_type test(...);

}  // namespace has_equality_operator_impl

/**
 * Derives from std::true_type if a value of type \p T can be compared with a value of
 * type \p U (\p T by default) via operator== and the comparison does not yield void;
 * derives from std::false_type otherwise.
 */
template <typename T, typename U = T>
struct has_equality_operator
    : decltype(has_equality_operator_impl::test<T, U>(1))
{
};

// compile-time self-checks
static_assert(has_equality_operator<int>::value, "has_equality_operator fails");
namespace
{
// empty class without operator==, used for the negative check only
class Foobar {};
static_assert(!has_equality_operator<Foobar>::value, "has_equality_operator fails");
} // unnamed namespace

#endif  // VC_TRAITS_HAS_EQUALITY_OPERATOR_H_

// Primary templates of the internal type identification traits. Each of them is false
// for every type here; presumably they are specialized for the matching types
// elsewhere (not visible in this part of the file).
template <typename T> struct is_simd_mask_internal : std::false_type {};
template <typename T> struct is_simd_vector_internal : std::false_type {};
template <typename T> struct is_subscript_operation_internal : std::false_type {};
template <typename T> struct is_simdarray_internal : std::false_type {};
template <typename T> struct is_simd_mask_array_internal : std::false_type {};
template <typename T> struct is_loadstoreflag_internal : std::false_type {};

/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

// Implementation of is_gather_signature. The first parameter is the number of
// arguments, the remaining parameters are the argument types.
template <std::size_t, typename... Args> struct is_gather_signature_impl;

// At least two arguments: true if the first two argument types both support
// subscripting (with the default index type of has_subscript_operator) and the second
// one is not flagged by is_loadstoreflag_internal.
template <std::size_t N_, typename Arg0, typename Arg1, typename... MoreArguments>
struct is_gather_signature_impl<N_, Arg0, Arg1, MoreArguments...>
    : std::integral_constant<bool, has_subscript_operator<Arg0>::value &&
                                       !is_loadstoreflag_internal<Arg1>::value &&
                                       has_subscript_operator<Arg1>::value> {
};

// Argument counts of zero and one are never accepted.
template <typename... Args>
struct is_gather_signature_impl<0, Args...> : std::false_type {};
template <typename... Args>
struct is_gather_signature_impl<1, Args...> : std::false_type {};

// Public entry point: counts the arguments and passes every argument type through
// decay<> before handing them to the implementation.
template <typename... Args>
struct is_gather_signature : is_gather_signature_impl<sizeof...(Args), decay<Args>...> {};

// Primary template: any argument list not covered by the specialization below is
// rejected. The first parameter is the number of arguments.
template <std::size_t, typename... Args>
struct is_cast_arguments_internal : std::false_type {};

// Exactly one argument: accepted if is_simdarray_internal or is_simd_vector_internal
// flags its type.
template <typename Arg>
struct is_cast_arguments_internal<1, Arg>
    : std::integral_constant<bool, is_simdarray_internal<Arg>::value ||
                                       is_simd_vector_internal<Arg>::value>
{
};

// Classification traits. The second parameter records whether is_simd_vector_internal
// flags T and selects one of the specializations below.
template <typename T, bool = is_simd_vector_internal<T>::value>
struct is_integral_internal;
template <typename T, bool = is_simd_vector_internal<T>::value>
struct is_floating_point_internal;
template <typename T, bool = is_simd_vector_internal<T>::value>
struct is_signed_internal;
template <typename T, bool = is_simd_vector_internal<T>::value>
struct is_unsigned_internal;

// T is not flagged: forward to the corresponding std trait applied to T itself.
template <typename T>
struct is_integral_internal<T, false> : std::is_integral<T> {};
template <typename T>
struct is_floating_point_internal<T, false> : std::is_floating_point<T> {};
template <typename T>
struct is_signed_internal<T, false> : std::is_signed<T> {};
template <typename T>
struct is_unsigned_internal<T, false> : std::is_unsigned<T> {};

// V is flagged: apply the corresponding std trait to the member type V::EntryType.
template <typename V>
struct is_integral_internal<V, true> : std::is_integral<typename V::EntryType> {};
template <typename V>
struct is_floating_point_internal<V, true>
    : std::is_floating_point<typename V::EntryType> {};
template <typename V>
struct is_signed_internal<V, true> : std::is_signed<typename V::EntryType> {};
template <typename V>
struct is_unsigned_internal<V, true> : std::is_unsigned<typename V::EntryType> {};

template <typename T>
struct is_arithmetic_internal
    : public std::integral_constant<
          bool,
          (is_floating_point_internal<T>::value || is_integral_internal<T>::value)>
{
};

template <typename T,
          bool = (is_simd_vector_internal<T>::value || is_simd_mask_internal<T>::value ||
                  is_simdarray_internal<T>::value ||
                  is_simd_mask_array_internal<T>::value)>
struct vector_size_internal;

template <typename T>
struct vector_size_internal<T, true> : public std::integral_constant<std::size_t, T::Size>
{
};
template <typename T>
struct vector_size_internal<T, false> : public std::integral_constant<std::size_t, 0>
{
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

/**
 * Identifies any SIMD mask type (independent of implementation or whether it's
 * SimdMaskArray<T, N>).
 */
template <typename T>
struct is_simd_mask : public std::integral_constant<bool,
                                                    (is_simd_mask_internal<decay<T>>::value ||
                                                     is_simd_mask_array_internal<decay<T>>::value)>
{
};

/**
 * Identifies any SIMD vector type (independent of implementation or whether it's
 * SimdArray<T, N>).
 */
template <typename T>
struct is_simd_vector
    : public std::integral_constant<bool,
                                    (is_simd_vector_internal<decay<T>>::value ||
                                     is_simdarray_internal<decay<T>>::value)>
{
};

/// Identifies any possible SimdArray<T, N> type (independent of const/volatile or
/// reference); forwards the decayed type to is_simdarray_internal.
template <typename T> struct isSimdArray : is_simdarray_internal<decay<T>> {};

/// Identifies any possible SimdMaskArray<T, N> type (independent of const/volatile or
/// reference); forwards the decayed type to is_simd_mask_array_internal.
template <typename T> struct isSimdMaskArray : is_simd_mask_array_internal<decay<T>> {};

/// \internal Identifies SubscriptOperation types
template <typename T>
struct is_subscript_operation : is_subscript_operation_internal<decay<T>> {};

/// \internal Identifies LoadStoreFlag types
template <typename T>
struct is_load_store_flag : is_loadstoreflag_internal<decay<T>> {};

/// \internal Identifies the function signature of a cast. The number of arguments and
/// the decayed argument types are forwarded to is_cast_arguments_internal.
template <typename... Args>
struct is_cast_arguments : is_cast_arguments_internal<sizeof...(Args), decay<Args>...> {};

/// \internal Identifies a SimdArray type with a single Vector member. The primary
/// template is false for every type.
template <typename T> struct is_atomic_simdarray_internal : std::false_type {};
/// \internal Applies is_atomic_simdarray_internal to the decayed type.
template <typename T>
using isAtomicSimdArray = is_atomic_simdarray_internal<decay<T>>;

/// \internal Identifies a SimdMaskArray type with a single Mask member. The primary
/// template is false for every type.
template <typename T> struct is_atomic_simd_mask_array_internal : std::false_type {};
/// \internal Applies is_atomic_simd_mask_array_internal to the decayed type.
template <typename T>
using isAtomicSimdMaskArray = is_atomic_simd_mask_array_internal<decay<T>>;

/**
 * The \p value member is the number of SIMD vector entries (the \c Size member of the
 * decayed type), or 0 if \p T is not a SIMD type.
 */
template <typename T> struct simd_vector_size : vector_size_internal<decay<T>> {};

// Public classification traits: each one applies the matching *_internal trait to the
// decayed type.
template <typename T> struct is_integral : is_integral_internal<decay<T>> {};
template <typename T> struct is_floating_point : is_floating_point_internal<decay<T>> {};
template <typename T> struct is_arithmetic : is_arithmetic_internal<decay<T>> {};
template <typename T> struct is_signed : is_signed_internal<decay<T>> {};
template <typename T> struct is_unsigned : is_unsigned_internal<decay<T>> {};

// Not a SIMD vector: the scalar type is T itself.
template <typename T, bool IsSimdVector> struct scalar_type_internal
{
    typedef T type;
};
// SIMD vector: the scalar type is the member type T::EntryType.
template <typename T> struct scalar_type_internal<T, true>
{
    typedef typename T::EntryType type;
};
// Resolves to the EntryType of the decayed type if is_simd_vector<T> is true, and to
// the decayed type otherwise.
template <typename T>
using scalar_type =
    typename scalar_type_internal<decay<T>, is_simd_vector<T>::value>::type;

}  // namespace Traits
}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_TRAITS_ENTRY_TYPE_OF_H_
#define VC_TRAITS_ENTRY_TYPE_OF_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace entry_type_of_internal
{
template <typename T, bool = Traits::is_simd_vector<T>::value> struct entry_type;

template <typename T> struct entry_type<T, true>
{
    using type = typename decay<T>::EntryType;
};

template <typename T> struct entry_type<T, false>
{
    using type = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
};
}  // namespace entry_type_of_internal

/**
 * Resolves to T::EntryType if \p T is a SIMD type, otherwise it resolves to \p T itself.
 */
template <typename T> using entry_type_of = typename entry_type_of_internal::entry_type<T>::type;

}  // namespace Traits
}  // namespace Vc_VERSIONED_NAMESPACE

#endif  // VC_TRAITS_ENTRY_TYPE_OF_H_

// vim: foldmethod=marker

#endif // VC_TRAITS_TYPE_TRAITS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_PERMUTATION_H_
#define VC_COMMON_PERMUTATION_H_

/*  This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_MACROS_H_
#define VC_COMMON_MACROS_H_

#include <Vc/global.h>


// Vc_ALIGNED_TYPEDEF(n_, type_, new_type_)
//
// Declares `new_type_` as a name for `type_` with its alignment raised to `n_` bytes.
//   n_        requested alignment in bytes (an integral constant expression)
//   type_     the underlying type
//   new_type_ the name to declare
// The spelling depends on the compiler: MSVC uses __declspec(align), compilers defining
// __GNUC__ use the gnu::aligned attribute, everything else falls back to alignas.
#ifdef Vc_MSVC
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_)                                      \
    typedef __declspec(align(n_)) type_ new_type_
#elif defined __GNUC__
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_)                                      \
    typedef type_ new_type_[[gnu::aligned(n_)]]
#else  // the following is actually ill-formed according to C++1[14]
// Request an alignment of n_ itself. sizeof(n_) would be the size of the type of the
// alignment expression (e.g. sizeof(std::size_t)), not the requested alignment.
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_)                                      \
    using new_type_ alignas(n_) = type_
#endif

// On Windows (WIN32) we might see macros called min and max. Just undefine them and hope
// no one (re)defines them (NOMINMAX should help).
#ifdef WIN32
// Only define NOMINMAX if it is not defined yet: redefining it with a different
// replacement list (e.g. after a plain `#define NOMINMAX`) is a macro redefinition.
#ifndef NOMINMAX
#define NOMINMAX 1
#endif
#ifdef min
#undef min
#endif
#ifdef max
#undef max
#endif
#endif  // WIN32

// Vc_TEMPLATES_DROP_ATTRIBUTES is defined (to 1) when Vc_GCC is defined and is at least
// 0x60000.
#if defined Vc_GCC && Vc_GCC >= 0x60000
// GCC 6 drops all attributes on types passed as template arguments. This is important
// if a may_alias gets lost and therefore needs to be re-added in the implementation of
// the class template.
#define Vc_TEMPLATES_DROP_ATTRIBUTES 1
#endif

// Vc_RECURSIVE_MEMORY is defined (to 1) when Vc_IS_VERSION_2 evaluates to true or under
// the same GCC >= 6 condition as above.
#if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000)
// GCC 6 optimizes the RowMemory::fromRawData hack away (common/memorybase.h). Therefore
// the 2D Memory class is implemented recursively using 1D Memory members. Since this is
// an ABI break this is only enabled for GCC 6. With Vc 2.x all implementations should do
// this.
#define Vc_RECURSIVE_MEMORY 1
#endif

// Compiler-specific spellings of the attribute and helper macros used throughout Vc.
// Three families are distinguished:
//   1. Clang / Apple Clang,
//   2. other compilers that define __GNUC__ (with special cases for Vc_GCC and Vc_ICC),
//   3. everything else (Vc_MSVC, or an unknown compiler with mostly empty definitions).
// For several macros there are *_L and *_R variants: they hold the part of the decoration
// that is written to the left and to the right of a declaration, respectively; the
// unsuffixed macro is either the combination of both or the one-sided spelling.
#if defined Vc_CLANG || defined Vc_APPLECLANG
#  define Vc_UNREACHABLE __builtin_unreachable
#  define Vc_NEVER_INLINE [[gnu::noinline]]
#  define Vc_INTRINSIC_L inline
#  define Vc_INTRINSIC_R __attribute__((always_inline))
#  define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
// Vc_FLATTEN is intentionally empty for this compiler family.
#  define Vc_FLATTEN
#  define Vc_CONST __attribute__((const))
#  define Vc_CONST_L
#  define Vc_CONST_R Vc_CONST
#  define Vc_PURE __attribute__((pure))
#  define Vc_PURE_L
#  define Vc_PURE_R Vc_PURE
#  define Vc_MAY_ALIAS __attribute__((may_alias))
#  define Vc_ALWAYS_INLINE_L inline
#  define Vc_ALWAYS_INLINE_R __attribute__((always_inline))
#  define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
#  define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
#  define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
#  define Vc_RESTRICT __restrict__
// The deprecation macros expand to nothing in this branch.
#  define Vc_DEPRECATED(msg)
#  define Vc_DEPRECATED_ALIAS(msg)
#  define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#elif defined(__GNUC__)
#  define Vc_UNREACHABLE __builtin_unreachable
// With Vc_GCC and without __OPTIMIZE__ the may_alias attribute is left out.
#  if defined Vc_GCC && !defined __OPTIMIZE__
#    define Vc_MAY_ALIAS
#  else
#    define Vc_MAY_ALIAS __attribute__((__may_alias__))
#  endif
#  define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__))
#  define Vc_INTRINSIC_L inline
#  define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
#  define Vc_FLATTEN __attribute__((__flatten__))
#  define Vc_ALWAYS_INLINE_L inline
#  define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__))
#  define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
#  ifdef Vc_ICC
// ICC miscompiles if there are functions marked as pure or const
#    define Vc_PURE
#    define Vc_CONST
#    define Vc_NEVER_INLINE
#  else
#    define Vc_NEVER_INLINE [[gnu::noinline]]
#    define Vc_PURE __attribute__((__pure__))
#    define Vc_CONST __attribute__((__const__))
#  endif
#  define Vc_CONST_L
#  define Vc_CONST_R Vc_CONST
#  define Vc_PURE_L
#  define Vc_PURE_R Vc_PURE
#  define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
#  define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
#  define Vc_RESTRICT __restrict__
// The deprecation macros are empty for ICC and use the GNU attribute otherwise.
#  ifdef Vc_ICC
#    define Vc_DEPRECATED(msg)
#    define Vc_DEPRECATED_ALIAS(msg)
#  else
#    define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg)))
#    define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg)))
#  endif
#  define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#else
// Neither Clang nor a __GNUC__ compiler: MSVC gets __forceinline / __declspec spellings,
// any other compiler gets empty definitions.
#  define Vc_NEVER_INLINE
#  define Vc_FLATTEN
// Drop any previous definition of Vc_PURE before it is defined again below.
#  ifdef Vc_PURE
#    undef Vc_PURE
#  endif
#  define Vc_MAY_ALIAS
#  ifdef Vc_MSVC
#    define Vc_ALWAYS_INLINE inline __forceinline
#    define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE
#    define Vc_ALWAYS_INLINE_R
#    define Vc_CONST __declspec(noalias)
#    define Vc_CONST_L Vc_CONST
#    define Vc_CONST_R
#    define Vc_PURE /*Vc_CONST*/
#    define Vc_PURE_L Vc_PURE
#    define Vc_PURE_R
#    define Vc_INTRINSIC inline __forceinline
#    define Vc_INTRINSIC_L Vc_INTRINSIC
#    define Vc_INTRINSIC_R
// MSVC has no __builtin_unreachable; wrap __assume(0) in a function so that
// Vc_UNREACHABLE can be called like a function, as in the other branches.
namespace Vc_VERSIONED_NAMESPACE {
namespace detail
{
static Vc_INTRINSIC void unreachable() { __assume(0); }
}  // namespace detail
}
#    define Vc_UNREACHABLE Vc::detail::unreachable
#  else
#    define Vc_ALWAYS_INLINE
#    define Vc_ALWAYS_INLINE_L
#    define Vc_ALWAYS_INLINE_R
#    define Vc_CONST
#    define Vc_CONST_L
#    define Vc_CONST_R
#    define Vc_PURE
#    define Vc_PURE_L
#    define Vc_PURE_R
#    define Vc_INTRINSIC
#    define Vc_INTRINSIC_L
#    define Vc_INTRINSIC_R
// Unknown compiler: reaching "unreachable" code calls std::abort.
#    define Vc_UNREACHABLE std::abort
#  endif
// The likelihood hints are plain pass-throughs in this branch.
#  define Vc_IS_UNLIKELY(x) x
#  define Vc_IS_LIKELY(x) x
#  define Vc_RESTRICT __restrict
// NOTE(review): __declspec(deprecated) is used for every compiler reaching this branch,
// not only for Vc_MSVC — confirm that this is intended.
#  define Vc_DEPRECATED(msg) __declspec(deprecated(msg))
#  define Vc_DEPRECATED_ALIAS(msg)
#  define Vc_WARN_UNUSED_RESULT
#endif

// When Vc_CXX14 is defined, replace whatever compiler-specific Vc_DEPRECATED was chosen
// before with the standard [[deprecated]] attribute.
#ifdef Vc_CXX14
#undef Vc_DEPRECATED
#define Vc_DEPRECATED(msg_) [[deprecated(msg_)]]
#endif

// A declaration without any effect that must be followed by a semicolon. It is placed at
// the end of macros so that the macro invocation can (and must) be terminated with `;`.
#define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "")

// Vc_FREE_STORE_OPERATORS_ALIGNED(align_)
//
// Expands to a set of operator new / operator delete overloads (scalar and array forms,
// plus the placement forms). The allocating overloads forward to
// Vc::Common::aligned_malloc<align_>(size), the deallocating ones to Vc::Common::free(ptr);
// the placement new overloads return the given pointer and the placement delete overloads
// do nothing. The expansion ends in Vc_NOTHING_EXPECTING_SEMICOLON, so the invocation has
// to be followed by a semicolon.
// NOTE(review): presumably meant to be invoked inside a class definition — confirm.
#define Vc_FREE_STORE_OPERATORS_ALIGNED(align_)                                          \
    /**\name new/delete overloads for correct alignment */                               \
    /**@{*/                                                                              \
    /*!\brief Allocates correctly aligned memory */                                      \
    Vc_ALWAYS_INLINE void *operator new(size_t size)                                     \
    {                                                                                    \
        return Vc::Common::aligned_malloc<align_>(size);                                 \
    }                                                                                    \
    /*!\brief Returns \p p. */                                                           \
    Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; }                   \
    /*!\brief Allocates correctly aligned memory */                                      \
    Vc_ALWAYS_INLINE void *operator new[](size_t size)                                   \
    {                                                                                    \
        return Vc::Common::aligned_malloc<align_>(size);                                 \
    }                                                                                    \
    /*!\brief Returns \p p. */                                                           \
    Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; }                 \
    /*!\brief Frees aligned memory. */                                                   \
    Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); }  \
    /*!\brief Does nothing. */                                                           \
    Vc_ALWAYS_INLINE void operator delete(void *, void *) {}                             \
    /*!\brief Frees aligned memory. */                                                   \
    Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t)                           \
    {                                                                                    \
        Vc::Common::free(ptr);                                                           \
    }                                                                                    \
    /*!\brief Does nothing. */                                                           \
    Vc_ALWAYS_INLINE void operator delete[](void *, void *) {}                           \
    /**@}*/                                                                              \
    Vc_NOTHING_EXPECTING_SEMICOLON

// Vc_ASSERT(x): assertion hook.
//  - If Vc_ASSERT is already defined at this point, that definition is kept and
//    Vc_EXTERNAL_ASSERT is defined to 1 to record the fact.
//  - Otherwise it expands to nothing when NDEBUG is defined, and to `assert(x);` when it
//    is not.
// NOTE(review): the assert-based expansion already ends in a semicolon, so
// `Vc_ASSERT(x);` yields an additional empty statement in that configuration.
#ifdef Vc_ASSERT
#define Vc_EXTERNAL_ASSERT 1
#else
#ifdef NDEBUG
#define Vc_ASSERT(x)
#else
#include <assert.h>
#define Vc_ASSERT(x) assert(x);
#endif
#endif

// Vc_HAS_BUILTIN(x): forwards to __has_builtin(x) for Clang / Apple Clang and evaluates to
// 0 for every other compiler.
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_HAS_BUILTIN(x) __has_builtin(x)
#else
#define Vc_HAS_BUILTIN(x) 0
#endif

// Token pasting helpers. The public macros (Vc_CAT for four tokens, Vc_CAT2 for two) go
// through a second macro so that their arguments are macro-expanded before being pasted.
#define Vc_CAT_HELPER_(t0_, t1_, t2_, t3_) t0_##t1_##t2_##t3_
#define Vc_CAT(t0_, t1_, t2_, t3_) Vc_CAT_HELPER_(t0_, t1_, t2_, t3_)

#define Vc_CAT_IMPL(lhs_, rhs_) lhs_##rhs_
#define Vc_CAT2(lhs_, rhs_) Vc_CAT_IMPL(lhs_, rhs_)

// Macro machinery for stamping out one macro invocation per element of a list.
//
// Vc_APPLY_IMPL_<N>_(macro, a, b, c, d, e) invokes `macro` with the first N of the five
// arguments a..e and discards the rest.
#define Vc_APPLY_IMPL_1_(macro, a, b, c, d, e) macro(a)
#define Vc_APPLY_IMPL_2_(macro, a, b, c, d, e) macro(a, b)
#define Vc_APPLY_IMPL_3_(macro, a, b, c, d, e) macro(a, b, c)
#define Vc_APPLY_IMPL_4_(macro, a, b, c, d, e) macro(a, b, c, d)
#define Vc_APPLY_IMPL_5_(macro, a, b, c, d, e) macro(a, b, c, d, e)

// Vc_LIST_*(size, macro, a, b, c, d): each list invokes `size` (one of the
// Vc_APPLY_IMPL_<N>_ macros above) once per list element, passing `macro`, the element,
// and the extra arguments a..d. The elements are vector type names or operator tokens.
#define Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
    size(macro, double_v, a, b, c, d) \
    size(macro,  float_v, a, b, c, d)
#define Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \
    size(macro,    int_v, a, b, c, d) \
    size(macro,   uint_v, a, b, c, d) \
    size(macro,  short_v, a, b, c, d) \
    size(macro, ushort_v, a, b, c, d)
#define Vc_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \
    Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
    Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d)
#define Vc_LIST_COMPARES(size, macro, a, b, c, d) \
    size(macro, ==, a, b, c, d) \
    size(macro, !=, a, b, c, d) \
    size(macro, <=, a, b, c, d) \
    size(macro, >=, a, b, c, d) \
    size(macro, < , a, b, c, d) \
    size(macro, > , a, b, c, d)
#define Vc_LIST_LOGICAL(size, macro, a, b, c, d) \
    size(macro, &&, a, b, c, d) \
    size(macro, ||, a, b, c, d)
#define Vc_LIST_BINARY(size, macro, a, b, c, d) \
    size(macro, |, a, b, c, d) \
    size(macro, &, a, b, c, d) \
    size(macro, ^, a, b, c, d)
#define Vc_LIST_SHIFTS(size, macro, a, b, c, d) \
    size(macro, <<, a, b, c, d) \
    size(macro, >>, a, b, c, d)
#define Vc_LIST_ARITHMETICS(size, macro, a, b, c, d) \
    size(macro, +, a, b, c, d) \
    size(macro, -, a, b, c, d) \
    size(macro, *, a, b, c, d) \
    size(macro, /, a, b, c, d) \
    size(macro, %, a, b, c, d)

// Vc_APPLY_<K>(_list, macro, extra...): expands `_list` so that `macro` is invoked for
// every list element with K extra arguments (unused slots are filled with 0). The
// expansion ends in Vc_NOTHING_EXPECTING_SEMICOLON, so the invocation takes a trailing `;`.
#define Vc_APPLY_0(_list, macro)             _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_1(_list, macro, a)          _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_2(_list, macro, a, b)       _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_3(_list, macro, a, b, c)    _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_4(_list, macro, a, b, c, d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON

// Convenience wrappers: invoke `macro` with one argument for every element of a list.
#define Vc_ALL_COMPARES(macro)     Vc_APPLY_0(Vc_LIST_COMPARES, macro)
#define Vc_ALL_LOGICAL(macro)      Vc_APPLY_0(Vc_LIST_LOGICAL, macro)
#define Vc_ALL_BINARY(macro)       Vc_APPLY_0(Vc_LIST_BINARY, macro)
#define Vc_ALL_SHIFTS(macro)       Vc_APPLY_0(Vc_LIST_SHIFTS, macro)
#define Vc_ALL_ARITHMETICS(macro)  Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro)
#define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro)
#define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro)

// Vc_EXACT_TYPE(Test_, Reference_, Result_): names the type Result_ if Test_ and
// Reference_ are the same type; otherwise the enable_if has no member `type`.
#define Vc_EXACT_TYPE(Test_, Reference_, Result_)                                        \
    typename std::enable_if<std::is_same<Test_, Reference_>::value, Result_>::type

// Vc_make_unique(name_): builds an identifier of the form Vc_<name_>_<line number>.
#define Vc_make_unique(name_) Vc_CAT(Vc_, name_, _, __LINE__)

// Vc_OFFSETOF(Type, member): byte offset of `member` within `Type`.
// For ICC, Clang and Apple Clang the offset is computed by hand from the address of the
// member of a Type object assumed at address 0; all other compilers use the standard
// offsetof macro.
// NOTE(review): presumably the hand-written form avoids diagnostics about offsetof on
// non-standard-layout types — confirm.
#if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
#define Vc_OFFSETOF(Type, member) (reinterpret_cast<const char *>(&reinterpret_cast<const Type *>(0)->member) - reinterpret_cast<const char *>(0))
#else
#define Vc_OFFSETOF(Type, member) offsetof(Type, member)
#endif

// Vc_NOEXCEPT: `noexcept` by default; falls back to the empty dynamic exception
// specification `throw()` when Vc_NO_NOEXCEPT is defined.
#ifndef Vc_NO_NOEXCEPT
#define Vc_NOEXCEPT noexcept
#else
#define Vc_NOEXCEPT throw()
#endif

// When Vc_NO_ALWAYS_INLINE is defined, the forced-inlining macros chosen above are
// replaced: Vc_ALWAYS_INLINE and Vc_INTRINSIC (and their _L variants) become a plain
// `inline`, and the _R variants become empty.
#ifdef Vc_NO_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE_L
#undef Vc_ALWAYS_INLINE_R
#define Vc_ALWAYS_INLINE inline
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R
#undef Vc_INTRINSIC
#undef Vc_INTRINSIC_L
#undef Vc_INTRINSIC_R
#define Vc_INTRINSIC inline
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R
#endif

#endif // VC_COMMON_MACROS_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Permutation
{
/// Empty tag type; it carries no data and only serves to select an overload.
/// NOTE(review): presumably requests a reversed element order — confirm against users.
struct ReversedTag {
};

/// The tag object of type ReversedTag to pass to such overloads.
constexpr ReversedTag Reversed = {};
}  // namespace Permutation
}  // namespace Vc_VERSIONED_NAMESPACE

#endif  // VC_COMMON_PERMUTATION_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_VECTORABI_H_
#define VC_COMMON_VECTORABI_H_

namespace Vc_VERSIONED_NAMESPACE
{
/// Tag types naming the available vector ABIs, plus aliases that select one of them.
namespace VectorAbi
{
// One empty tag type per ABI.
struct Scalar {};
struct Sse {};
struct Avx {};
struct Mic {};
/// Selects VectorAbi::Sse for integral \p T and VectorAbi::Avx for every other \p T.
template <typename T>
using Avx1Abi = typename std::conditional<std::is_integral<T>::value, VectorAbi::Sse,
                                          VectorAbi::Avx>::type;
/// Selects the ABI tag matching CurrentImplementation. The conditions are tested in this
/// order: ScalarImpl -> Scalar, SSE2Impl..SSE42Impl -> Sse, AVXImpl -> Avx1Abi<T>,
/// AVX2Impl -> Avx, MICImpl -> Mic. If none of them matches, the result is \c void.
template <typename T>
using Best = typename std::conditional<
    CurrentImplementation::is(ScalarImpl), Scalar,
    typename std::conditional<
        CurrentImplementation::is_between(SSE2Impl, SSE42Impl), Sse,
        typename std::conditional<
            CurrentImplementation::is(AVXImpl), Avx1Abi<T>,
            typename std::conditional<
                CurrentImplementation::is(AVX2Impl), Avx,
                typename std::conditional<CurrentImplementation::is(MICImpl), Mic,
                                          void>::type>::type>::type>::type>::type;
// Compile-time sanity checks: for the implementation selected via the Vc_IMPL_* macros,
// Best<float> and Best<int> must resolve to the expected ABI tags.
#ifdef Vc_IMPL_AVX2
static_assert(std::is_same<Best<float>, Avx>::value, "");
static_assert(std::is_same<Best<int>, Avx>::value, "");
#elif defined Vc_IMPL_AVX
static_assert(std::is_same<Best<float>, Avx>::value, "");
static_assert(std::is_same<Best<int>, Sse>::value, "");
#elif defined Vc_IMPL_SSE
static_assert(CurrentImplementation::is_between(SSE2Impl, SSE42Impl), "");
static_assert(std::is_same<Best<float>, Sse>::value, "");
static_assert(std::is_same<Best<int>, Sse>::value, "");
#elif defined Vc_IMPL_MIC
static_assert(std::is_same<Best<float>, Mic>::value, "");
static_assert(std::is_same<Best<int>, Mic>::value, "");
#elif defined Vc_IMPL_Scalar
static_assert(std::is_same<Best<float>, Scalar>::value, "");
static_assert(std::is_same<Best<int>, Scalar>::value, "");
#endif
}  // namespace VectorAbi
}  // namespace Vc

#endif  // VC_COMMON_VECTORABI_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{
// Forward declarations of the two central class templates.
template <class T, class Abi> class Mask;
template <class T, class Abi> class Vector;

///\addtogroup Utilities
///@{

/// \internal Makes \c size_t usable without the `std::` qualification.
using std::size_t;

/// Shorthand for `long long`.
using llong = long long;
/// Shorthand for `unsigned long long`.
using ullong = unsigned long long;
/// Shorthand for `unsigned long`.
using ulong = unsigned long;
/// Shorthand for `unsigned int`.
using uint = unsigned int;
/// Shorthand for `unsigned short`.
using ushort = unsigned short;
/// Shorthand for `unsigned char`.
using uchar = unsigned char;
/// Shorthand for `signed char`.
using schar = signed char;

/**\internal
 * Empty tag type requesting explicit zero-initialization.
 */
struct VectorSpecialInitializerZero {
};
/**\internal
 * Empty tag type requesting explicit one-initialization.
 */
struct VectorSpecialInitializerOne {
};
/**\internal
 * Empty tag type requesting explicit "iota-initialization".
 */
struct VectorSpecialInitializerIndexesFromZero {
};

/**
 * Pass the special object \p Vc::Zero to construct Vector and Mask objects that are
 * initialized to zero/\c false.
 */
constexpr VectorSpecialInitializerZero Zero{};
/**
 * Pass the special object \p Vc::One to construct Vector and Mask objects that are
 * initialized to one/\c true.
 */
constexpr VectorSpecialInitializerOne One{};
/**
 * Pass the special object \p Vc::IndexesFromZero to construct Vector objects that are
 * initialized to the values 0, 1, 2, 3, 4, ...
 */
constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero{};
///@}

namespace Detail
{
/**\internal
 * Provides the member typedef \c type, which is \p T decorated with Vc_MAY_ALIAS.
 *
 * When __GNUC__ is defined, the -Wattributes diagnostic is disabled around the typedef
 * and restored afterwards.
 */
template<typename T> struct MayAliasImpl {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
#endif
    typedef T type Vc_MAY_ALIAS;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
};
// Disabled special case for MaskBool (kept for reference):
//template<size_t Bytes> struct MayAlias<MaskBool<Bytes>> { typedef MaskBool<Bytes> type; };
}  // namespace Detail
/**\internal
 * Helper MayAlias<T> that turns T into the type to be used for an aliasing pointer. This
 * adds the may_alias attribute to T (with compilers that support it). But for MaskBool this
 * attribute is already part of the type and applying it a second time leads to
 * warnings/errors, therefore MaskBool is simply forwarded as is.
 *
 * With Vc_ICC the attribute is applied directly on the alias template; all other
 * compilers go through Detail::MayAliasImpl.
 *
 * NOTE(review): the MaskBool special case described above appears to be disabled (the
 * corresponding specialization in namespace Detail is commented out) — confirm whether
 * this paragraph is still accurate.
 */
#ifdef Vc_ICC
template <typename T> using MayAlias [[gnu::may_alias]] = T;
#else
template <typename T> using MayAlias = typename Detail::MayAliasImpl<T>::type;
#endif

/**\internal
 * Enumeration of the C++ operators, one enumerator per operator.
 *
 * The enumerators for assignment and compound assignment are used with the
 * conditional_assign implementation.
 */
enum class Operator : char {
    // plain assignment
    Assign,

    // multiplicative operators and their compound assignments
    Multiply, MultiplyAssign,
    Divide, DivideAssign,
    Remainder, RemainderAssign,

    // additive operators and their compound assignments
    Plus, PlusAssign,
    Minus, MinusAssign,

    // shifts and their compound assignments
    RightShift, RightShiftAssign,
    LeftShift, LeftShiftAssign,

    // bitwise operators and their compound assignments
    And, AndAssign,
    Xor, XorAssign,
    Or, OrAssign,

    // increment / decrement
    PreIncrement, PostIncrement,
    PreDecrement, PostDecrement,

    // logical operators and comma
    LogicalAnd, LogicalOr,
    Comma,

    // unary operators
    UnaryPlus, UnaryMinus,
    UnaryNot, UnaryOnesComplement,

    // comparisons
    CompareEqual, CompareNotEqual,
    CompareLess, CompareGreater,
    CompareLessEqual, CompareGreaterEqual
};

// Vc::array is defined in <Vc/array>; only its forward declaration is needed here.
template <class T, std::size_t N> struct array;

/* TODO: add type for half-float, something along these lines:
class half_float
{
    uint16_t data;
public:
    constexpr half_float() : data(0) {}
    constexpr half_float(const half_float &) = default;
    constexpr half_float(half_float &&) = default;
    constexpr half_float &operator=(const half_float &) = default;

    constexpr explicit half_float(float);
    constexpr explicit half_float(double);
    constexpr explicit half_float(int);
    constexpr explicit half_float(unsigned int);

    explicit operator float       () const;
    explicit operator double      () const;
    explicit operator int         () const;
    explicit operator unsigned int() const;

    bool operator==(half_float rhs) const;
    bool operator!=(half_float rhs) const;
    bool operator>=(half_float rhs) const;
    bool operator<=(half_float rhs) const;
    bool operator> (half_float rhs) const;
    bool operator< (half_float rhs) const;

    half_float operator+(half_float rhs) const;
    half_float operator-(half_float rhs) const;
    half_float operator*(half_float rhs) const;
    half_float operator/(half_float rhs) const;
};
*/

// TODO: the following doesn't really belong into the toplevel Vc namespace.
/*
 * assertCorrectAlignment(ptr) checks that ptr is aligned as required by alignof(T).
 * With Vc_CHECK_ALIGNMENT defined, a misaligned pointer leads to a message on stderr
 * followed by abort(). Without Vc_CHECK_ALIGNMENT the function does nothing.
 */
#ifdef Vc_CHECK_ALIGNMENT
template <typename T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const T *ptr)
{
    const size_t alignment = alignof(T);
    // Isolate the lowest set bit of the alignment; subtracting one from it yields the
    // mask of address bits that must all be zero.
    const size_t lowestSetBit = alignment ^ (alignment & (alignment - 1));
    const size_t misalignmentMask = lowestSetBit - 1;
    if ((reinterpret_cast<size_t>(ptr) & misalignmentMask) == 0) {
        return;
    }
    fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n");
    abort();
}
#else
template <typename T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const T *) {}
#endif

namespace Common
{
/**
 * \internal
 *
 * Helper interface that lets m_indexes in InterleavedMemoryAccessBase behave like an
 * integer vector, except that its entries are successive entries counted from a given
 * start index: entry \c i has the value `first + i * StructSize`.
 */
template <size_t StructSize> class SuccessiveEntries
{
#ifdef Vc_MSVC
    // scatterinterleavedmemory fails with garbage values in m_first if size_type is a
    // 64-bit integer type. Using a 32-bit type seems to work around the miscompilation.
    using size_type = unsigned;
#else
    using size_type = size_t;
#endif
    /// The start index.
    const size_type m_first;

public:
    using AsArg = SuccessiveEntries;

    /// Creates the sequence that starts at \p first.
    Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {}

    /// Returns the entry at position \p offset, i.e. `first + offset * StructSize`.
    Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const
    {
        const size_type distance = offset * StructSize;
        return m_first + distance;
    }

    /// Returns the start index.
    Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; }

    /// Returns a sequence whose start index is the sum of both start indexes.
    Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const
    {
        return {m_first + rhs.m_first};
    }

    /// Returns a sequence whose start index is the product of both start indexes.
    Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const
    {
        return {m_first * rhs.m_first};
    }

    /// Returns a sequence whose start index is shifted left by \p x bits.
    Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const
    {
        return SuccessiveEntries(m_first << x);
    }

    /// internal_data is the identity for SuccessiveEntries (mutable overload).
    friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x)
    {
        return x;
    }
    /// internal_data is the identity for SuccessiveEntries (const overload).
    friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x)
    {
        return x;
    }
};

// declaration for functions in common/malloc.h
// aligned_malloc<alignment>(n) and free(p) are the functions that the operator new /
// operator delete overloads generated by Vc_FREE_STORE_OPERATORS_ALIGNED forward to.
// The *_L / *_R macros place the inlining decoration to the left and right of the
// declaration.
template <std::size_t alignment>
Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R;

/**\internal
 * Central definition of the type combinations that convert implicitly.
 *
 * The condition holds if \p U is a SIMD mask (Traits::is_simd_mask) but not a
 * SimdMaskArray (Traits::isSimdMaskArray), and Traits::is_implicit_cast_allowed_mask is
 * true for the entry type of U's associated \c Vector type and \p T.
 *
 * \tparam T  the entry type of the conversion target
 * \tparam U  the source mask type
 */
template <typename T, typename U>
using enable_if_mask_converts_implicitly =
    enable_if<(Traits::is_simd_mask<U>::value && !Traits::isSimdMaskArray<U>::value &&
               Traits::is_implicit_cast_allowed_mask<
                   Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value)>;
/**\internal
 * Central definition of the type combinations that only convert explicitly.
 *
 * The condition holds if \p U is a SimdMaskArray, or if \p U is a SIMD mask for which
 * Traits::is_implicit_cast_allowed_mask is false for the entry type of U's associated
 * \c Vector type and \p T.
 *
 * \tparam T  the entry type of the conversion target
 * \tparam U  the source mask type
 */
template <typename T, typename U>
using enable_if_mask_converts_explicitly = enable_if<(
    Traits::isSimdMaskArray<U>::value ||
    (Traits::is_simd_mask<U>::value &&
     !Traits::is_implicit_cast_allowed_mask<
         Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value))>;

/**\internal
 * Tag type for overloading on the width (\VSize{T}) of a vector: a
 * std::integral_constant of type std::size_t that holds `sizeof(T)`.
 */
template <class T> using WidthT = std::integral_constant<std::size_t, sizeof(T)>;

// MaskBool is defined in common/maskbool.h; forward declaration only.
template <std::size_t NBytes> class MaskBool;

// SubscriptOperation is defined in common/subscript.h; forward declaration only.
template <class T, class IndexVector, class Scale, bool Flag>
class SubscriptOperation;

/**
 * \internal
 * Bundles the two arguments of a gather operation: the indexes and the base address the
 * indexes refer to. Both members are const.
 *
 * \tparam T            The type of the entries that \c address points to.
 * \tparam IndexVector  Normally an integer SIMD vector, but an array or std::vector also
 *                      works (though often not as efficient).
 */
template <class T, class IndexVector> struct GatherArguments
{
    IndexVector const indexes;  ///< the indexes to gather from
    T const *const address;     ///< the base address the indexes are applied to
};

/**
 * \internal
 * Bundles the two arguments of a scatter operation: the indexes and the base address the
 * indexes refer to. Both members are const; the entries behind \c address are mutable.
 *
 * \tparam T            The type of the entries that \c address points to.
 * \tparam IndexVector  Normally an integer SIMD vector, but an array or std::vector also
 *                      works (though often not as efficient).
 */
template <class T, class IndexVector> struct ScatterArguments
{
    IndexVector const indexes;  ///< the indexes to scatter to
    T *const address;           ///< the base address the indexes are applied to
};

/**\internal
 * Break the recursion of the function below.
 *
 * This overload is selected when `Begin >= End`; it ignores its argument and does
 * nothing. It has to be declared before the recursive overload so that the recursive
 * call can find it.
 */
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&)
{
}

/**\internal
 * Force the code in the lambda \p f to be called with indexes starting from \p Begin up
 * to (excluding) \p End to be called without compare and jump instructions (i.e. an
 * unrolled loop).
 *
 * \tparam I      The type of the index passed to \p f.
 * \tparam Begin  The first index.
 * \tparam End    One past the last index.
 * \param  f      The callable; it is invoked as `f(Begin)`, `f(Begin + 1)`, ...
 */
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f)
{
    f(Begin);
    // Recurse with the next index; f is passed on as an lvalue.
    unrolled_loop<I, Begin + 1, End>(f);
}

/**\internal
 * Small simplification of the unrolled_loop call for ranges from 0 to \p Size using
 * std::size_t as the index type.
 *
 * \tparam Size  The number of invocations; \p f is called with the indexes 0 to Size - 1.
 * \param  f     The callable, perfectly forwarded to unrolled_loop.
 */
template <std::size_t Size, typename F> Vc_INTRINSIC void for_all_vector_entries(F &&f)
{
    unrolled_loop<std::size_t, 0u, Size>(std::forward<F>(f));
}

/**\internal
 * The member type `type` is either `T` or `T` with alignment increased to sizeof(T) if
 * alignof(T) < sizeof(T).
 *
 * The primary template is used whenever \p Size and \p Alignment differ and declares
 * `type` via Vc_ALIGNED_TYPEDEF with \p Size as the requested alignment. The partial
 * specialization below matches when both values are equal and leaves `T` unchanged.
 *
 * \tparam T          The type to adjust.
 * \tparam Size       Defaults to sizeof(T).
 * \tparam Alignment  Defaults to alignof(T).
 */
template <class T, std::size_t Size = sizeof(T), std::size_t Alignment = alignof(T)>
struct ensure_alignment_equals_sizeof {
    Vc_ALIGNED_TYPEDEF(Size, T, type);
};
// Alignment already equals the size: no adjustment needed.
template <class T, std::size_t Size>
struct ensure_alignment_equals_sizeof<T, Size, Size> {
    using type = T;
};

}  // namespace Common
}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_VECTOR_H_
#define VC_COMMON_VECTOR_H_

#include <ratio>
/*  This file is part of the Vc library. {{{
Copyright © 2016 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_ELEMENTREFERENCE_H_
#define VC_COMMON_ELEMENTREFERENCE_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename U, typename Accessor = U> class ElementReference
{
    using value_type = typename U::value_type;
    friend U;
    friend Accessor;
    Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {}

    static constexpr bool get_noexcept =
        noexcept(Accessor::get(std::declval<U &>(), int()));
    template <typename T> static constexpr bool set_noexcept()
    {
        return noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>()));
    }

public:
    Vc_INTRINSIC ElementReference(const ElementReference &) = delete;

    /**
     * Move Constructor
     *
     * this is the only way to constructor an ElementReference in user code
     *
     * \note
     * Please be aware that this class models the concept of a reference
     * and as such it can have the same lifetime issue as a standard C++
     * reference.
     *
     * \note
     * C++ 17 support copy-elision, which in turn allows to
     * the ElementReference obtained via operator[] from a function
     * and avoid copying. C++11 and C++14 don't offer this, thus we add
     * the move constructor, to allow them to move the data and thus avoid
     * copying (which was prohibited by the deleted constructor above
     */
    Vc_INTRINSIC ElementReference(ElementReference &&) = default;

    Vc_INTRINSIC operator value_type() const noexcept(get_noexcept)
    {
        return Accessor::get(obj, index);
    }

    template <typename T>
        Vc_INTRINSIC ElementReference &operator=(T &&x) &&
        noexcept(noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>())))
    {
        Accessor::set(obj, index, std::forward<T>(x));
        return *this;
    }

// TODO: improve with operator.()

// Vc_OP_(op_) expands to the compound assignment operator `op_=` of ElementReference.
// The generated operator
//  - is only callable on rvalue ElementReference objects (&&-qualified),
//  - reads the current element via Accessor::get, evaluates `lhs op_ x` and writes
//    the result back via Accessor::set, then returns *this,
//  - has a defaulted template parameter R naming the type of `value_type op_ T`; if
//    that expression is ill-formed the overload drops out via SFINAE,
//  - is noexcept if get_noexcept holds and Accessor::set with an R&& argument is
//    noexcept.
// The macro is applied to every operator listed by Vc_ALL_ARITHMETICS, Vc_ALL_SHIFTS
// and Vc_ALL_BINARY (defined elsewhere) and undefined again afterwards.
#define Vc_OP_(op_)                                                                      \
    template <typename T, typename R = decltype(std::declval<const value_type &>()       \
                                                    op_ std::declval<T>())>              \
        Vc_INTRINSIC ElementReference &operator op_##=(T &&x) &&                         \
        noexcept(get_noexcept && noexcept(Accessor::set(std::declval<U &>(), int(),      \
                                                        std::declval<R &&>())))          \
    {                                                                                    \
        const value_type &lhs = Accessor::get(obj, index);                               \
        Accessor::set(obj, index, lhs op_ std::forward<T>(x));                           \
        return *this;                                                                    \
    }
    Vc_ALL_ARITHMETICS(Vc_OP_);
    Vc_ALL_SHIFTS(Vc_OP_);
    Vc_ALL_BINARY(Vc_OP_);
#undef Vc_OP_

    /**
     * Pre-increment: reads the element into a local copy, increments the copy and
     * stores the result of `++x` back via Accessor::set.
     *
     * Declared as a template with a defaulted parameter, presumably so that the body
     * and the noexcept expression are only instantiated when the operator is used.
     * noexcept if assigning the result of Accessor::get to a value_type is noexcept
     * and set_noexcept<decltype(++x)>() is true.
     *
     * \return *this
     */
    template <typename = void>
        Vc_INTRINSIC ElementReference &operator++() &&
        noexcept(noexcept(std::declval<value_type &>() =
                              Accessor::get(std::declval<U &>(), int())) &&
                 set_noexcept<decltype(++std::declval<value_type &>())>())
    {
        value_type x = Accessor::get(obj, index);
        Accessor::set(obj, index, ++x);
        return *this;
    }

    /**
     * Post-increment: stores the incremented value back into the element and returns
     * the value the element had before the increment.
     *
     * Unlike the prefix form this returns a value_type (a copy), not a reference.
     *
     * NOTE(review): the noexcept specification queries set_noexcept with the type of
     * the postfix expression `x++`, whereas the body passes `++x` to Accessor::set.
     * For built-in arithmetic types this presumably makes no difference - confirm.
     */
    template <typename = void>
        Vc_INTRINSIC value_type operator++(int) &&
        noexcept(noexcept(std::declval<value_type &>() =
                              Accessor::get(std::declval<U &>(), int())) &&
                 set_noexcept<decltype(std::declval<value_type &>()++)>())
    {
        const value_type r = Accessor::get(obj, index);
        value_type x = r;
        Accessor::set(obj, index, ++x);
        return r;
    }

    /**
     * Pre-decrement: reads the element into a local copy, decrements the copy and
     * stores the result of `--x` back via Accessor::set.
     *
     * noexcept if assigning the result of Accessor::get to a value_type is noexcept
     * and set_noexcept<decltype(--x)>() is true.
     *
     * \return *this
     */
    template <typename = void>
        Vc_INTRINSIC ElementReference &operator--() &&
        noexcept(noexcept(std::declval<value_type &>() =
                              Accessor::get(std::declval<U &>(), int())) &&
                 set_noexcept<decltype(--std::declval<value_type &>())>())
    {
        value_type x = Accessor::get(obj, index);
        Accessor::set(obj, index, --x);
        return *this;
    }

    /**
     * Post-decrement: stores the decremented value back into the element and returns
     * the value the element had before the decrement.
     *
     * Unlike the prefix form this returns a value_type (a copy), not a reference.
     *
     * NOTE(review): the noexcept specification queries set_noexcept with the type of
     * the postfix expression `x--`, whereas the body passes `--x` to Accessor::set.
     * For built-in arithmetic types this presumably makes no difference - confirm.
     */
    template <typename = void>
        Vc_INTRINSIC value_type operator--(int) &&
        noexcept(noexcept(std::declval<value_type &>() =
                              Accessor::get(std::declval<U &>(), int())) &&
                 set_noexcept<decltype(std::declval<value_type &>()--)>())
    {
        const value_type r = Accessor::get(obj, index);
        value_type x = r;
        Accessor::set(obj, index, --x);
        return r;
    }

    /// Exchanges the values of the two elements referenced by \p a and \p b.
    friend void swap(ElementReference &&a, ElementReference &&b) {
        // Save a's value first; the assignments need rvalue references again because
        // the named parameters are lvalues.
        value_type saved_a(a);
        std::move(a) = static_cast<value_type>(b);
        std::move(b) = saved_a;
    }

    /// Exchanges the plain value \p a with the value of the element referenced by \p b.
    friend void swap(value_type &a, ElementReference &&b) {
        value_type saved_a(a);
        a = static_cast<value_type>(b);
        // b is a named parameter and thus an lvalue; the assignment needs an rvalue.
        std::move(b) = saved_a;
    }

    /// Exchanges the value of the element referenced by \p a with the plain value \p b.
    friend void swap(ElementReference &&a, value_type &b) {
        value_type saved_a(a);
        // a is a named parameter and thus an lvalue; the assignment needs an rvalue.
        std::move(a) = b;
        b = saved_a;
    }

private:
    int index;  // position of the referenced element; handed to Accessor::get/set
    U &obj;     // object containing the element; handed to Accessor::get/set
};

}  // namespace Detail
}  // namespace Vc

#endif  // VC_COMMON_ELEMENTREFERENCE_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_VECTORTRAITS_H_
#define VC_COMMON_VECTORTRAITS_H_


namespace Vc_VERSIONED_NAMESPACE
{
/// Forward declaration of the VectorTraits class template (parameterized on a type
/// \p T and an ABI tag \p Abi); no definition is provided here.
template <typename T, typename Abi> struct VectorTraits;
}  // namespace Vc

#endif  // VC_COMMON_VECTORTRAITS_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_SIMDARRAYFWD_H_
#define VC_COMMON_SIMDARRAYFWD_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/


#ifndef VC_SSE_TYPES_H_
#define VC_SSE_TYPES_H_

// Per-type size constants, only defined if SSE is the default implementation.
// Presumably the number of entries in the default double/float/int/uint/short/ushort
// vector types - confirm against the users of these macros.
#ifdef Vc_DEFAULT_IMPL_SSE
#define Vc_DOUBLE_V_SIZE 2
#define Vc_FLOAT_V_SIZE 4
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
/// Vc::Vector bound to the VectorAbi::Sse ABI tag, plus shorthands per entry type.
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Sse>;
using double_v = Vector<double>;
using float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<unsigned int>;
using short_v = Vector<short>;
using ushort_v = Vector<unsigned short>;

/// Vc::Mask bound to the VectorAbi::Sse ABI tag, plus shorthands per entry type.
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Sse>;
using double_m = Mask<double>;
using float_m = Mask<float>;
using int_m = Mask<int>;
using uint_m = Mask<unsigned int>;
using short_m = Mask<short>;
using ushort_m = Mask<unsigned short>;

/// Only declared here.
template <typename T> struct Const;

/// true_type exactly for instantiations of SSE::Vector.
template <typename T> struct is_vector : std::false_type {};
template <typename T> struct is_vector<Vector<T>> : std::true_type {};
/// true_type exactly for instantiations of SSE::Mask.
template <typename T> struct is_mask : std::false_type {};
template <typename T> struct is_mask<Mask<T>> : std::true_type {};
}  // namespace SSE

namespace Traits
{
// Mark SSE::Mask<T> and SSE::Vector<T> as SIMD mask / SIMD vector types for the
// internal type traits.
template <typename T>
struct is_simd_mask_internal<SSE::Mask<T>> : std::true_type {
};
template <typename T>
struct is_simd_vector_internal<SSE::Vector<T>> : std::true_type {
};
}  // namespace Traits
}  // namespace Vc

#endif // VC_SSE_TYPES_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_TYPES_H_
#define VC_AVX_TYPES_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/


#ifndef VC_AVX_MACROS_H_
#define VC_AVX_MACROS_H_

#endif // VC_AVX_MACROS_H_

// Per-type size constants for the case that AVX2 or AVX is the default
// implementation (presumably the number of entries per default vector type -
// confirm against the users of these macros).
// Note that the two branches differ only in the integer sizes: with AVX2 the
// int/uint values are 8 and short/ushort 16, with plain AVX they are 4 and 8.
#ifdef Vc_DEFAULT_IMPL_AVX2
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 8
#define Vc_UINT_V_SIZE 8
#define Vc_SHORT_V_SIZE 16
#define Vc_USHORT_V_SIZE 16
#elif defined Vc_DEFAULT_IMPL_AVX
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
/// Vc::Vector bound to the ABI tag selected by VectorAbi::Avx1Abi<T>, plus shorthands
/// per entry type.
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx1Abi<T>>;
using double_v = Vector<double>;
using float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<unsigned int>;
using short_v = Vector<short>;
using ushort_v = Vector<unsigned short>;

/// Vc::Mask bound to the ABI tag selected by VectorAbi::Avx1Abi<T>, plus shorthands
/// per entry type.
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx1Abi<T>>;
using double_m = Mask<double>;
using float_m = Mask<float>;
using int_m = Mask<int>;
using uint_m = Mask<unsigned int>;
using short_m = Mask<short>;
using ushort_m = Mask<unsigned short>;

/// Only declared here.
template <typename T> struct Const;

/// true_type exactly for instantiations of AVX::Vector.
template <typename T> struct is_vector : std::false_type {};
template <typename T> struct is_vector<Vector<T>> : std::true_type {};
/// true_type exactly for instantiations of AVX::Mask.
template <typename T> struct is_mask : std::false_type {};
template <typename T> struct is_mask<Mask<T>> : std::true_type {};
}  // namespace AVX

namespace AVX2
{
/// Vc::Vector bound to the VectorAbi::Avx ABI tag, plus shorthands per entry type.
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx>;
using double_v = Vector<double>;
using float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<uint>;
using short_v = Vector<short>;
using ushort_v = Vector<ushort>;

/// Vc::Mask bound to the VectorAbi::Avx ABI tag, plus shorthands per entry type.
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
using double_m = Mask<double>;
using float_m = Mask<float>;
using llong_m = Mask<llong>;
using ullong_m = Mask<ullong>;
using long_m = Mask<long>;
using ulong_m = Mask<ulong>;
using int_m = Mask<int>;
using uint_m = Mask<uint>;
using short_m = Mask<short>;
using ushort_m = Mask<ushort>;
using schar_m = Mask<schar>;
using uchar_m = Mask<uchar>;

/// true_type exactly for instantiations of AVX2::Vector.
template <typename T> struct is_vector : std::false_type {};
template <typename T> struct is_vector<Vector<T>> : std::true_type {};
/// true_type exactly for instantiations of AVX2::Mask.
template <typename T> struct is_mask : std::false_type {};
template <typename T> struct is_mask<Mask<T>> : std::true_type {};
}  // namespace AVX2

namespace Traits
{
// Mark Mask<T, VectorAbi::Avx> and Vector<T, VectorAbi::Avx> as SIMD mask / SIMD
// vector types for the internal type traits.
template <typename T>
struct is_simd_mask_internal<Mask<T, VectorAbi::Avx>> : std::true_type {
};
template <typename T>
struct is_simd_vector_internal<Vector<T, VectorAbi::Avx>> : std::true_type {
};
}  // namespace Traits
}  // namespace Vc

#endif // VC_AVX_TYPES_H_
/*  This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/


#ifndef VC_MIC_TYPES_H_
#define VC_MIC_TYPES_H_

// Per-type size constants, only defined if MIC is the default implementation.
// Presumably the number of entries in the default double/float/int/uint/short/ushort
// vector types - confirm against the users of these macros.
#ifdef Vc_DEFAULT_IMPL_MIC
#define Vc_DOUBLE_V_SIZE 8
#define Vc_FLOAT_V_SIZE 16
#define Vc_INT_V_SIZE 16
#define Vc_UINT_V_SIZE 16
#define Vc_SHORT_V_SIZE 16
#define Vc_USHORT_V_SIZE 16
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace MIC
{
/// Vc::Vector bound to the VectorAbi::Mic ABI tag, plus shorthands per entry type.
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Mic>;
using double_v = Vector<double>;
using float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<unsigned int>;
using short_v = Vector<short>;
using ushort_v = Vector<unsigned short>;
using schar_v = Vector<schar>;
using uchar_v = Vector<uchar>;

/// Vc::Mask bound to the VectorAbi::Mic ABI tag, plus shorthands per entry type.
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Mic>;
using double_m = Mask<double>;
using float_m = Mask<float>;
using int_m = Mask<int>;
using uint_m = Mask<unsigned int>;
using short_m = Mask<short>;
using ushort_m = Mask<unsigned short>;
using schar_m = Mask<schar>;
using uchar_m = Mask<uchar>;

/// true_type exactly for instantiations of MIC::Vector.
template <typename T> struct is_vector : std::false_type {};
template <typename T> struct is_vector<Vector<T>> : std::true_type {};
/// true_type exactly for instantiations of MIC::Mask.
template <typename T> struct is_mask : std::false_type {};
template <typename T> struct is_mask<Mask<T>> : std::true_type {};
}  // namespace MIC

namespace Traits
{
// Mark MIC::Mask<T> and MIC::Vector<T> as SIMD mask / SIMD vector types for the
// internal type traits.
template <typename T>
struct is_simd_mask_internal<MIC::Mask<T>> : std::true_type {
};
template <typename T>
struct is_simd_vector_internal<MIC::Vector<T>> : std::true_type {
};
}  // namespace Traits
}  // namespace Vc

#endif // VC_MIC_TYPES_H_

/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_UTILITY_H_
#define VC_COMMON_UTILITY_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/**
 * \internal
 * Compile-time computation of the smallest power of 2 that is not less than \p x.
 *
 * The defaulted bool parameter is true if \p x has at most one bit set. In that case
 * \p x itself is the result (note that this includes x == 0, which yields 0).
 * Otherwise lower bits are filled in by OR-ing shifted copies of \p x, one is added
 * and the computation recurses until a power of 2 is reached.
 */
template <size_t x, bool = (x & (x - 1)) == 0> struct NextPowerOfTwo;

// Terminal case: x has at most one bit set.
template <size_t x>
struct NextPowerOfTwo<x, true> : std::integral_constant<size_t, x> {
};

// Recursive case: the new argument is larger than x but never exceeds the next power
// of 2, so the recursion ends exactly there.
template <size_t x>
struct NextPowerOfTwo<x, false>
    : std::integral_constant<
          size_t, NextPowerOfTwo<((x >> 5) | (x >> 2) | (x >> 1) | x) + 1>::value> {
};

/**
 * \internal
 * Enforce an upper bound to an alignment value. This is necessary because some compilers
 * implement such an upper bound and emit a warning if it is encountered.
 *
 * The value is ((A - 1) & mask) + 1 with
 *  - mask = 31 if Vc_MSVC is defined,
 *  - mask = 255 if only Vc_GCC is defined and __AVX__ is set,
 *  - mask = 127 if only Vc_GCC is defined and __AVX__ is not set.
 * For every other compiler \p A is passed through unchanged.
 */
template <size_t A>
struct BoundedAlignment
#if defined Vc_MSVC
    : public std::integral_constant<size_t, ((A - 1) & 31) + 1>
#elif defined Vc_GCC
#ifdef __AVX__
    : public std::integral_constant<size_t, ((A - 1) & 255) + 1>
#else
    : public std::integral_constant<size_t, ((A - 1) & 127) + 1>
#endif
#else
    : public std::integral_constant<size_t, A>
#endif
{
};

/**
 * \internal
 * Returns the size of the left/first SimdArray member.
 *
 * The result is NextPowerOfTwo applied to N / 2 rounded up, e.g. N = 5 yields
 * NextPowerOfTwo<3> = 4 and N = 8 yields NextPowerOfTwo<4> = 4.
 */
template <std::size_t N> static constexpr std::size_t left_size()
{
    return Common::NextPowerOfTwo<(N + 1) / 2>::value;
}
/**
 * \internal
 * Returns the size of the right/second SimdArray member.
 *
 * This is whatever remains of N after left_size<N>() entries, e.g. N = 5 yields
 * 5 - 4 = 1 and N = 8 yields 8 - 4 = 4.
 */
template <std::size_t N> static constexpr std::size_t right_size()
{
    return N - left_size<N>();
}

}  // namespace Common
}  // namespace Vc

#endif  // VC_COMMON_UTILITY_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/// \addtogroup SimdArray
/// @{
/*select_best_vector_type{{{*/
/**
 * \internal
 * Selects the best SIMD type out of a typelist to store N scalar values.
 *
 * The list is walked front to back and the first type whose \c Size member does not
 * exceed \p N is chosen. The last type of the list is the unconditional fallback.
 */
template <std::size_t N, typename... Typelist> struct select_best_vector_type_impl;

// A single candidate is left: take it, whatever its Size is.
template <std::size_t N, typename Last> struct select_best_vector_type_impl<N, Last> {
    using type = Last;
};

// Several candidates: keep Head unless its Size is larger than N, in which case the
// decision is delegated to the remaining candidates.
template <std::size_t N, typename Head, typename... Tail>
struct select_best_vector_type_impl<N, Head, Tail...> {
    using type = typename std::conditional<
        (Head::Size <= N), Head,
        typename select_best_vector_type_impl<N, Tail...>::type>::type;
};
// Convenience alias: applies select_best_vector_type_impl to the list of vector types
// of the implementation selected via the Vc_IMPL_* macros. Every list ends with
// Vc::Scalar::Vector<T>, which therefore acts as the fallback.
// NOTE(review): if none of the Vc_IMPL_* macros tested below is defined, the template
// argument list ends with a trailing comma and does not compile.
template <typename T, std::size_t N>
using select_best_vector_type =
    typename select_best_vector_type_impl<N,
#ifdef Vc_IMPL_AVX2
                                          Vc::AVX2::Vector<T>,
                                          Vc::SSE::Vector<T>,
                                          Vc::Scalar::Vector<T>
#elif defined(Vc_IMPL_AVX)
                                          Vc::AVX::Vector<T>,
                                          Vc::SSE::Vector<T>,
                                          Vc::Scalar::Vector<T>
#elif defined(Vc_IMPL_Scalar)
                                          Vc::Scalar::Vector<T>
#elif defined(Vc_IMPL_SSE)
                                          Vc::SSE::Vector<T>,
                                          Vc::Scalar::Vector<T>
#elif defined(Vc_IMPL_MIC)
                                          Vc::MIC::Vector<T>,
                                          Vc::Scalar::Vector<T>
#endif
                                          >::type;
//}}}
/// @}
}  // namespace Common

// === having SimdArray<T, N> in the Vc namespace leads to an ABI bug ===
//
// SimdArray<double, 4> can be { double[4] }, { __m128d[2] }, or { __m256d } even though the type
// is the same.
// The question is, what should SimdArray focus on?
// a) A type that makes interfacing between different implementations possible?
// b) Or a type that makes fixed size SIMD easier and efficient?
//
// a) can be achieved by using a union with T[N] as one member. But this may have more serious
// performance implications than only less efficient parameter passing (because compilers have a
// much harder time wrt. aliasing issues). Also alignment would need to be set to the sizeof in
// order to be compatible with targets with larger alignment requirements.
// But, the in-memory representation of masks is not portable. Thus, at the latest with AVX-512,
// there would be a problem with requiring SimdMaskArray<T, N> to be an ABI compatible type.
// AVX-512 uses one bit per boolean, whereas SSE/AVX use sizeof(T) Bytes per boolean. Conversion
// between the two representations is not a trivial operation. Therefore choosing one or the other
// representation will have a considerable impact for the targets that do not use this
// representation. Since the future probably belongs to one bit per boolean representation, I would
// go with that choice.
//
// b) requires that SimdArray<T, N> != SimdArray<T, N> if
// SimdArray<T, N>::vector_type != SimdArray<T, N>::vector_type
//
// Therefore use SimdArray<T, N, V>, where V follows from the above.
// Forward declaration of SimdArray. V defaults to
// Common::select_best_vector_type<T, N> and Wt defaults to V::Size.
template <typename T, size_t N, typename V = Common::select_best_vector_type<T, N>,
          size_t Wt = V::Size  // this last parameter is only used for specialization of N
                               // == VectorSize
          >
class SimdArray;

// Forward declaration of SimdMaskArray; it takes the same template parameters with
// the same defaults as SimdArray: V defaults to
// Common::select_best_vector_type<T, N> and Wt defaults to V::Size.
template <typename T, size_t N, typename V = Common::select_best_vector_type<T, N>,
          size_t Wt = V::Size  // this last parameter is only used for specialization of N
                               // == VectorSize
          >
class SimdMaskArray;

/** \internal
 * Simple traits for SimdArray to easily access internal types of non-atomic SimdArray
 * types.
 *
 * N0 and N1 are the entry counts computed by Common::left_size<N>() and
 * Common::right_size<N>(); storage_type0 and storage_type1 are the SimdArray types
 * (with default vector type) holding N0 and N1 entries, respectively.
 */
template <typename T, std::size_t N> struct SimdArrayTraits {
    static constexpr std::size_t N0 = Common::left_size<N>();
    static constexpr std::size_t N1 = Common::right_size<N>();

    using storage_type0 = SimdArray<T, N0>;
    using storage_type1 = SimdArray<T, N1>;
};

// Declarations only (no definitions here) of the accessors internal_data0 and
// internal_data1, each in a mutable and a const overload. They take a SimdArray and
// return a reference to SimdArrayTraits<T, N>::storage_type0 / storage_type1 -
// presumably the first and second storage member of a non-atomic SimdArray.
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
    SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
    SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
    const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
    const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;

// Declarations only (no definitions here) of internal_data for the SimdArray
// instantiations whose fourth template argument equals N. The mutable and const
// overloads return a reference to V - presumably the single vector object stored in
// such a SimdArray.
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L V &internal_data(SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L const V &internal_data(const SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;

namespace Traits
{
template <typename T, std::size_t N, typename V> struct is_atomic_simdarray_internal<SimdArray<T, N, V, N>> : public std::true_type {};
template <typename T, std::size_t N, typename V> struct is_atomic_simd_mask_array_internal<SimdMaskArray<T, N, V, N>> : public std::true_type {};

template <typename T, std::size_t N, typename VectorType, std::size_t M> struct is_simdarray_internal<SimdArray<T, N, VectorType, M>> : public std::true_type {};
template <typename T, std::size_t N, typename VectorType, std::size_t M> struct is_simd_mask_array_internal<SimdMaskArray<T, N, VectorType, M>> : public std::true_type {};
template <typename T, std::size_t N, typename V, std::size_t M> struct is_integral_internal      <SimdArray<T, N, V, M>, false> : public std::is_integral<T> {};
template <typename T, std::size_t N, typename V, std::size_t M> struct is_floating_point_internal<SimdArray<T, N, V, M>, false> : public std::is_floating_point<T> {};
template <typename T, std::size_t N, typename V, std::size_t M> struct is_signed_internal        <SimdArray<T, N, V, M>, false> : public std::is_signed<T> {};
template <typename T, std::size_t N, typename V, std::size_t M> struct is_unsigned_internal      <SimdArray<T, N, V, M>, false> : public std::is_unsigned<T> {};

template<typename T, std::size_t N> struct has_no_allocated_data_impl<Vc::SimdArray<T, N>> : public std::true_type {};
}  // namespace Traits

}  // namespace Vc

#endif  // VC_COMMON_SIMDARRAYFWD_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_LOADSTOREFLAGS_H_
#define VC_COMMON_LOADSTOREFLAGS_H_


namespace Vc_VERSIONED_NAMESPACE
{

/**
 * Hint for \ref Prefetch to select prefetches that mark the memory as exclusive.
 *
 * This hint may optimize the prefetch if the memory will subsequently be written to.
 *
 * Empty tag type: it carries no data and is only ever compared by type (see
 * PrefetchFlag::IsExclusive).
 */
struct Exclusive {};
/**
 * Hint for \ref Prefetch to select prefetches that mark the memory as shared.
 *
 * Empty tag type: it carries no data and is only ever compared by type (see
 * PrefetchFlag::IsShared).
 */
struct Shared {};

namespace LoadStoreFlags
{

// Empty tag types. A flag in a LoadStoreFlags list is identified by having one of
// these types as (or as base of) its type; ExtractType tests this via
// std::is_base_of.
struct StreamingFlag {};
struct UnalignedFlag {};
// Common base of all PrefetchFlag instantiations.
struct PrefetchFlagBase {};
// Prefetch flag carrying two stride values (L1, L2) and an optional
// Exclusive/Shared hint. The default strides depend on whether Vc_IMPL_MIC is
// defined. IsExclusive / IsShared tell whether the hint type is exactly Exclusive or
// Shared; with the default hint (void) both are false.
#ifdef Vc_IMPL_MIC
template <size_t L1 = 8 * 64, size_t L2 = 64 * 64,
#else
// TODO: determine a good default for typical CPU use
template <size_t L1 = 16 * 64, size_t L2 = 128 * 64,
#endif
          typename ExclusiveOrShared_ = void>
struct PrefetchFlag : PrefetchFlagBase {
    using ExclusiveOrShared = ExclusiveOrShared_;
    static constexpr size_t L1Stride = L1;
    static constexpr size_t L2Stride = L2;
    static constexpr bool IsExclusive = std::is_same<ExclusiveOrShared_, Exclusive>::value;
    static constexpr bool IsShared = std::is_same<ExclusiveOrShared_, Shared>::value;
};

// Searches a list of flag types front to back for the first type for which
// std::is_base_of<Base, T> holds and exposes it as `type`. If no type matches (or the
// list is empty) `type` is Default.
template <typename Base, typename Default, typename... Rest> struct ExtractType {
    using type = Default;
};
template <typename Base, typename Default, typename First, typename... Rest>
struct ExtractType<Base, Default, First, Rest...> {
    using type = typename std::conditional<
        std::is_base_of<Base, First>::value, First,
        typename ExtractType<Base, Default, Rest...>::type>::type;
};

// ICC warns about the constexpr members in LoadStoreFlags: member "LoadStoreFlags<Flags...>::IsAligned" was declared but never referenced
// who needs that warning, especially if it was referenced...
// The warning cannot be reenabled because it gets emitted whenever the LoadStoreFlags is instantiated
// somewhere, so it could be anywhere.
#ifdef Vc_ICC
#pragma warning(disable: 177)
#endif
/**\internal
 * Implementation of the load/store flags mechanism. This is internal API. Only some
 * concrete aliases are API-relevant types.
 */
template<typename... Flags> struct LoadStoreFlags
{
private:
    // ICC doesn't grok this line:
    //template<typename Test> using TestFlag = std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>;
    typedef typename ExtractType<PrefetchFlagBase, PrefetchFlag<0, 0>, Flags...>::type Prefetch;

public:
    constexpr LoadStoreFlags() {}

    static constexpr bool IsStreaming = !std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>::value;
    static constexpr bool IsUnaligned = !std::is_same<typename ExtractType<UnalignedFlag, void, Flags...>::type, void>::value;
    static constexpr bool IsAligned = !IsUnaligned;
    static constexpr bool IsPrefetch = !std::is_same<typename ExtractType<PrefetchFlagBase, void, Flags...>::type, void>::value;
    static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive;
    static constexpr bool IsSharedPrefetch = Prefetch::IsShared;
    static constexpr size_t L1Stride = Prefetch::L1Stride;
    static constexpr size_t L2Stride = Prefetch::L2Stride;

    typedef LoadStoreFlags<typename std::conditional<std::is_same<Flags, UnalignedFlag>::value, void, Flags>::type...> UnalignedRemoved;

    // The following EnableIf* convenience types cannot use enable_if because then no LoadStoreFlags type
    // could ever be instantiated. Instead these types are defined either as void* or void. The
    // function that does SFINAE then assigns "= nullptr" to this type. Thus, the ones with just
    // void result in substitution failure.
    typedef typename std::conditional<IsAligned   && !IsStreaming, void *, void>::type EnableIfAligned;
    typedef typename std::conditional<IsAligned   &&  IsStreaming, void *, void>::type EnableIfStreaming;
    typedef typename std::conditional<IsUnaligned && !IsStreaming, void *, void>::type EnableIfUnalignedNotStreaming;
    typedef typename std::conditional<IsUnaligned &&  IsStreaming, void *, void>::type EnableIfUnalignedAndStreaming;
    typedef typename std::conditional<IsUnaligned                , void *, void>::type EnableIfUnaligned;
    typedef typename std::conditional<!IsUnaligned               , void *, void>::type EnableIfNotUnaligned;
    typedef typename std::conditional<IsPrefetch                 , void *, void>::type EnableIfPrefetch;
    typedef typename std::conditional<!IsPrefetch                , void *, void>::type EnableIfNotPrefetch;
};

/**\internal
 * Specialization for the empty flag list (i.e. aligned, non-streaming, no prefetching).
 *
 * All constants are spelled out directly. Of the EnableIf* helper types only
 * EnableIfAligned, EnableIfNotUnaligned, and EnableIfNotPrefetch are declared here; the
 * remaining EnableIf* types and UnalignedRemoved are not members of this specialization.
 */
template <> struct LoadStoreFlags<>
{
    constexpr LoadStoreFlags() {}

    // alignment
    static constexpr bool IsUnaligned = false;
    static constexpr bool IsAligned = !IsUnaligned;

    // streaming
    static constexpr bool IsStreaming = false;

    // prefetching
    static constexpr bool IsPrefetch = false;
    static constexpr bool IsExclusivePrefetch = false;
    static constexpr bool IsSharedPrefetch = false;
    static constexpr size_t L1Stride = 0;
    static constexpr size_t L2Stride = 0;

    // helper types for overload selection
    using EnableIfAligned = void *;
    using EnableIfNotUnaligned = void *;
    using EnableIfNotPrefetch = void *;
};

/**
 * Operator for concatenation of LoadStoreFlags.
 *
 * Example:
 * \code
 * float_v x(mem, Vc::Aligned | Vc::Streaming);
 * \endcode
 */
template<typename... LFlags, typename... RFlags>
constexpr LoadStoreFlags<LFlags..., RFlags...> operator|(LoadStoreFlags<LFlags...>, LoadStoreFlags<RFlags...>)
{
    return LoadStoreFlags<LFlags..., RFlags...>();
}

} // LoadStoreFlags namespace

/// Make PrefetchFlag usable without the LoadStoreFlags:: qualification.
using LoadStoreFlags::PrefetchFlag;

/// Tag type with an empty flag list.
using AlignedTag = LoadStoreFlags::LoadStoreFlags<>;
/// Tag type carrying only the streaming flag.
using StreamingTag = LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::StreamingFlag>;
/// Tag type carrying only the unaligned flag.
using UnalignedTag = LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::UnalignedFlag>;

/// The default load tag type uses unaligned (non-streaming) loads.
using DefaultLoadTag = UnalignedTag;
/// The default store tag type uses unaligned (non-streaming) stores.
using DefaultStoreTag = UnalignedTag;

/**\addtogroup Utilities
 * @{
 */
/**
 * Use this object for a \p flags parameter to request aligned loads and stores.
 *
 * It specifies that a load/store can expect a memory address that is aligned on
 * the correct boundary (i.e. \p MemoryAlignment).
 *
 * The object is of type AlignedTag, i.e. it carries an empty flag list.
 *
 * \warning
 * If you specify Aligned, but the memory address is not aligned, the program
 * will most likely crash.
 */
constexpr AlignedTag Aligned;

/**
 * Use this object for a \p flags parameter to request unaligned loads and stores.
 *
 * It specifies that a load/store can \em not expect a memory address that is
 * aligned on the correct boundary (i.e. the alignment may be less than
 * \p MemoryAlignment).
 *
 * This is also what DefaultLoadTag and DefaultStoreTag select.
 *
 * \note
 * If you specify Unaligned, but the memory address is aligned, the load/store
 * will execute slightly slower than necessary.
 */
constexpr UnalignedTag Unaligned;

/**
 * Use this object for a \p flags parameter to request streaming loads and stores.
 *
 * It specifies that the cache should be bypassed for the given load/store.
 * Whether this will actually be done depends on the target system's capabilities.
 *
 * Streaming stores can be interesting when the code calculates values that, after being
 * written to memory, will not be used for a long time or used by a different thread.
 *
 * \note
 * Expect that most target systems do not support unaligned streaming loads or stores.
 * Therefore, make sure that you also specify Aligned. (Streaming on its own does not
 * contain the unaligned flag, so its \c IsAligned constant is already \c true.)
 */
constexpr StreamingTag Streaming;

/**
 * Use this object for a \p flags parameter to request default software prefetches to be
 * emitted.
 *
 * The object carries a single `PrefetchFlag<>`, i.e. the default L1/L2 strides and neither
 * the Exclusive nor the Shared hint.
 */
constexpr LoadStoreFlags::LoadStoreFlags<PrefetchFlag<>> PrefetchDefault;
///@}

/**
 * Flag type that requests software prefetching with a custom configuration.
 *
 * A `Prefetch<L1, L2, ExclusiveOrShared>` is a LoadStoreFlags object carrying the single
 * flag `PrefetchFlag<L1, L2, ExclusiveOrShared>`.
 *
 * \tparam L1 Forwarded as the first argument of PrefetchFlag, where it is published as
 *            \c L1Stride. Defaults to `PrefetchFlag<>::L1Stride`.
 * \tparam L2 Forwarded as the second argument of PrefetchFlag, where it is published as
 *            \c L2Stride. Defaults to `PrefetchFlag<>::L2Stride`.
 * \tparam ExclusiveOrShared Forwarded as the hint type of PrefetchFlag (see \ref Exclusive
 *            and \ref Shared). Defaults to `PrefetchFlag<>::ExclusiveOrShared`.
 */
template <size_t L1 = PrefetchFlag<>::L1Stride, size_t L2 = PrefetchFlag<>::L2Stride,
          typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared>
struct Prefetch
    : LoadStoreFlags::LoadStoreFlags<PrefetchFlag<L1, L2, ExclusiveOrShared>>
{
};

namespace Traits
{
///\internal partial specialization for detecting LoadStoreFlags types
template <typename... Ts>
struct is_loadstoreflag_internal<LoadStoreFlags::LoadStoreFlags<Ts...>> : public std::true_type
{
};
///\internal partial specialization for detecting the derived Prefetch type as a
/// load/store flag.
template <size_t L1, size_t L2, typename ExclusiveOrShared>
struct is_loadstoreflag_internal<Prefetch<L1, L2, ExclusiveOrShared>> : public std::true_type
{
};
}  // namespace Traits
}  // namespace Vc

#endif // VC_COMMON_LOADSTOREFLAGS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_WRITEMASKEDVECTOR_H_
#define VC_COMMON_WRITEMASKEDVECTOR_H_

#include <utility>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

/**
 * Helper that binds a reference to a vector of type \p V together with a mask of type
 * \p M and forwards modifying operations to the vector together with that mask.
 *
 * The object only stores a reference to the vector (and, when compiled with ICC, also only
 * a reference to the mask). It therefore must not outlive the objects it was constructed
 * from.
 *
 * \tparam V The vector type. The members below use `V::Size`, `V::One()`,
 *           `setZeroInverted`, `assign`, `gather`, `call`, and `apply`.
 * \tparam M The mask type, defaults to `V::Mask`. `M::Size` must equal `V::Size`.
 */
template <typename V, typename M = typename V::Mask> class WriteMaskedVector
{
    // The vector and the mask have to agree on the number of entries.
    static_assert(
        V::Size == M::Size,
        "incorrect use of Vc::Common::WriteMaskedVector<V, M>. V and M must have the same «Size».");

public:
    typedef M Mask;
    static constexpr size_t Size = V::Size;

    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));

    // implicit (allows {vec, mask} in places where WriteMaskedVector is expected)
    // Note that the initializer order matches the declaration order of the data members
    // (mask first, then vec).
    Vc_INTRINSIC WriteMaskedVector(V &v, const Mask &k) : mask(k), vec(v)
    {
    }

    // prefix
    // Both operators start from V::One(), call setZeroInverted(mask) on it (for Vector this
    // zeroes the entries where the mask is not set) and add/subtract the result to/from the
    // referenced vector. The returned reference is whatever `vec += one` / `vec -= one`
    // yields.
    Vc_INTRINSIC V &operator++()
    {
        V one = V::One();
        one.setZeroInverted(mask);
        return vec += one;
    }
    Vc_INTRINSIC V &operator--()
    {
        V one = V::One();
        one.setZeroInverted(mask);
        return vec -= one;
    }

    // postfix
    // Returns a copy of the complete vector as it was before the masked
    // increment/decrement.
    Vc_INTRINSIC V operator++(int)
    {
        V ret(vec);
        operator++();
        return ret;
    }
    Vc_INTRINSIC V operator--(int)
    {
        V ret(vec);
        operator--();
        return ret;
    }

    // Compound assignment operators: evaluate `vec op x` on the complete vector, convert
    // the result to V and hand it to the masked operator=(const V &) below.
#define Vc_OPERATOR_(op)                                                                 \
    template <typename U> Vc_ALWAYS_INLINE void operator op##=(U &&x)                    \
    {                                                                                    \
        operator=(static_cast<V>(vec op std::forward<U>(x)));                            \
    }
    Vc_ALL_BINARY(Vc_OPERATOR_);
    Vc_ALL_ARITHMETICS(Vc_OPERATOR_);
    Vc_ALL_SHIFTS(Vc_OPERATOR_);
#undef Vc_OPERATOR_

    // Masked assignment: forwards to vec.assign(x, mask).
    Vc_ALWAYS_INLINE void operator=(const V &x)
    {
        vec.assign(x, mask);
    }

    // Masked gather: forwards the gather arguments of the subscript expression to
    // vec.gather(..., mask).
    template <typename T, typename I, typename S>
    Vc_ALWAYS_INLINE void operator=(SubscriptOperation<T, I, S, true> &&x)
    {
        vec.gather(x.gatherArguments(), mask);
    }

    // call/apply forward the callable \p f together with the mask to vec.call / vec.apply.
    // The const-reference overloads pass \p f on as is, the forwarding-reference overloads
    // pass it on via std::forward.
    template <typename F> Vc_INTRINSIC void call(const F &f) const
    {
        return vec.call(f, mask);
    }
    template <typename F> Vc_INTRINSIC V apply(const F &f) const
    {
        return vec.apply(f, mask);
    }
    template <typename F> Vc_INTRINSIC void call(F &&f) const
    {
        return vec.call(std::forward<F>(f), mask);
    }
    template <typename F> Vc_INTRINSIC V apply(F &&f) const
    {
        return vec.apply(std::forward<F>(f), mask);
    }

private:
    // The mask that is passed along with every forwarded operation.
#ifdef Vc_ICC
    // If ICC gets a by-value copy of Mask here, it'll generate a lot of superfluous
    // stack-register copies.
    const Mask &mask;
#else
    // If Clang gets a const-ref Mask here, it'll miscompile some of the masked assignment
    // statements.
    const Mask mask;
#endif
    // The vector that all operations are applied to.
    V &vec;
};
}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_WRITEMASKEDVECTOR_H_

namespace Vc_VERSIONED_NAMESPACE
{
/**
 * \ingroup Math
 * Copies the sign(s) of \p sign to the value(s) in \p magnitude and returns the resulting
 * vector.
 *
 * \param magnitude This vector's magnitude will be used in the return vector.
 * \param sign This vector's sign bit will be used in the return vector.
 *
 * \return a value where the sign of the value equals the sign of \p sign. I.e.
 * `sign(copysign(v, r)) == sign(r)`.
 */
template <typename T, typename Abi,
          typename = enable_if<std::is_floating_point<T>::value>>
inline Vector<T, Abi> copysign(Vector<T, Abi> magnitude, Vector<T, Abi> sign);

/**
 * \ingroup Math
 * Extracts the exponent of each floating-point vector component.
 *
 * \param x The vector of values to extract the exponent from.
 * \return the exponent to base 2.
 *
 * This function provides efficient access to the exponent of the floating point number. The
 * returned value is a fast approximation to the logarithm of base 2. The absolute error of that
 * approximation is between [0, 1[.
 *
 * Examples:
\verbatim
 value | exponent | log2
=======|==========|=======
   1.0 |        0 | 0
   2.0 |        1 | 1
   3.0 |        1 | 1.585
   3.9 |        1 | 1.963
   4.0 |        2 | 2
   4.1 |        2 | 2.036
\endverbatim
 *
 * \warning This function assumes a positive value (non-zero). If the value is negative the sign bit will
 * modify the returned value. An input value of zero will return the bias of the floating-point
 * representation. If you compile with Vc runtime checks, the function will assert
 * that all values are greater than or equal to zero.
 *
 * You may use abs to apply this function to negative values:
 * \code
 * exponent(abs(v))
 * \endcode
 */
template <typename T, typename Abi,
          typename = enable_if<std::is_floating_point<T>::value>>
inline Vector<T, Abi> exponent(Vector<T, Abi> x);

/**
 * \ingroup Math
 * Returns for each vector component whether it stores a negative value.
 *
 * This generic implementation compares \p x against `Vector<T, Abi>::Zero()` with
 * `operator<`.
 * NOTE(review): being a plain less-than comparison, a floating-point negative zero is
 * presumably not reported as negative here — confirm whether that is intended.
 *
 * \param x The vector of values to check for the sign.
 * \returns a mask which is \c true only in those components that are negative in \p x.
 */
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST typename Vector<T, Abi>::MaskType isnegative(Vector<T, Abi> x)
{
    using V = Vector<T, Abi>;
    return x < V::Zero();
}

/**
 * \class Vector types.h <Vc/vector.h>
 * \ingroup Vectors
 *
 * The main vector class for expressing data parallelism.
 *
 * The types Vc::float_v, Vc::double_v, Vc::int_v, Vc::uint_v, Vc::short_v, and
 * Vc::ushort_v are specializations of this class.
 * For most cases there are no API differences for the specializations.
 * Make use of Vector<T> for generic programming, otherwise you might prefer to use
 * the \p *_v aliases.
 *
 * \see Vc::float_v, Vc::double_v, Vc::int_v, Vc::uint_v, Vc::short_v, Vc::ushort_v
 * \see Mask
 */
template<typename T, typename Abi = VectorAbi::Best<T>> class Vector
{
public:
    /**
     * Returns the number of scalar components (\VSize{T}) in a vector of this type.
     *
     * This is the size of the vector, i.e. the number of scalar entries it holds. Do not
     * make any assumptions about the size of vectors. If you need vectors of \c float and
     * \c int types use Vector::IndexType or SimdArray.
     *
     * Since the function is \c constexpr you can use plain if clauses to compare Vector
     * sizes. The compiler can statically evaluate them and fully optimize dead code away
     * (very much like \#ifdef, but with syntax checking).
     *
     * \returns The value of `VectorTraits<T, Abi>::size()`, i.e. the number of components
     * (\VSize{T}) objects of this vector type store and manipulate.
     */
    static constexpr size_t size()
    {
        return VectorTraits<T, Abi>::size();
    }

    /**
     * Specifies the alignment requirement for aligned load and store calls for objects of
     * this vector type.
     */
    static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::memoryAlignment();

    /// The ABI tag type of the current template instantiation.
    using abi = Abi;

    /// The type of the entries in the vector.
    using EntryType = typename VectorTraits<T, Abi>::EntryType;
    /// \copydoc EntryType
    using value_type = EntryType;

    using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
    /**\internal
     * This type reveals the implementation-specific type used for the data member.
     */
    using VectorType = typename VectorTraits<T, Abi>::VectorType;
    /**\internal
     * \copydoc VectorType
     */
    using vector_type = VectorType;

    /// The type of the mask used for masked operations and returned from comparisons.
    using MaskType = Vc::Mask<T, Abi>;
    /// \copydoc MaskType
    using mask_type = MaskType;

    using MaskArgument = MaskType;
    using VectorArgument = Vector;

    /// The type of the vector used for indexes in gather and scatter operations.
    using IndexType = Vc::SimdArray<int, VectorTraits<T, Abi>::size()>;
    /// \copydoc IndexType
    using index_type = IndexType;

    using reference = Detail::ElementReference<Vector>;

    /// \name Generators
    ///@{
    /**
     * Returns a vector with the entries initialized to zero.
     */
    static inline Vector Zero();

    /**
     * Returns a vector with the entries initialized to one.
     */
    static inline Vector One();

    /**
     * Returns a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ...
     */
    static inline Vector IndexesFromZero();

    /**
     * Returns a vector with pseudo-random entries.
     *
     * Currently the state of the random number generator cannot be modified and starts
     * off with the same state. Thus you will get the same sequence of numbers for the
     * same sequence of calls.
     *
     * \return a new random vector. Floating-point values will be in the 0-1 range.
     * Integers will use the full range the integer representation allows.
     *
     * \note This function may use a very small amount of state and thus will be a weak
     * random number generator.
     */
    static inline Vector Random();

    /// Generate a vector object from return values of \p gen (static variant of \ref fill).
    template <typename G> static inline Vector generate(G gen);
    ///@}

    /// \name Compile-Time Constant Initialization
    ///@{
    /**
     * Construct a zero-initialized vector object.
     *
     * This constructor follows the behavior of the underlying arithmetic type \p T in
     * that the expression `T()` zero-initializes the object. On the other hand the
     * variable \c x in `T x;` is uninitialized.
     * Since, for class types, both expressions call the default constructor `Vector<T> x`
     * must zero-initialize \c x as well.
     */
    inline Vector() = default;

    /**
     * Construct a vector with the entries initialized to zero.
     *
     * \see Vc::Zero, Zero()
     */
    explicit inline Vector(VectorSpecialInitializerZero);

    /**
     * Construct a vector with the entries initialized to one.
     *
     * \see Vc::One, One()
     */
    explicit inline Vector(VectorSpecialInitializerOne);

    /**
     * Construct a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ...
     *
     * \see Vc::IndexesFromZero, IndexesFromZero()
     */
    explicit inline Vector(VectorSpecialInitializerIndexesFromZero);
    ///@}

    /// \name Conversion/Broadcast Constructors
    ///@{
    /**
     * Implicit conversion from compatible Vector<U, Abi> types.
     */
    template <typename U>
    inline Vector(Vector<U, abi> x,
                  enable_if<Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);

#if Vc_IS_VERSION_1
    /**
     * Explicit conversion (i.e. `static_cast`) from the remaining Vector<U, Abi> types.
     *
     * \param x A vector object to use for initialization of the new vector object. If \p
     *          x contains more entries than the new object the high components will be
     *          ignored. If \p x contains fewer entries than the new object the high
     *          components of the new object will be zero-initialized. Type conversion is
     *          done according to the standard conversion rules for the underlying
     *          fundamental arithmetic types.
     */
    template <typename U>
    Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                  "vector types") inline explicit Vector(
        Vector<U, abi> x,
        enable_if<!Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#endif

    /**
     * Broadcast Constructor.
     *
     * Constructs a vector with all entries of the vector filled with the given value.
     *
     * \param a The scalar value to broadcast to all entries of the constructed vector.
     */
    inline Vector(EntryType a);
    template <typename U>
    inline Vector(U a, enable_if<std::is_same<U, int>::value &&
                                 !std::is_same<U, EntryType>::value> = nullarg);
    ///@}

    /**
     * \name Loads & Stores
     */
    ///@{
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

// load ctors{{{1
/**
 * Construct a vector from loading its entries from the array at \p mem.
 *
 * Forwards to load(mem), which uses the DefaultLoadTag flags.
 *
 * \param mem A pointer to data. The pointer need not be aligned on a
 *            MemoryAlignment boundary unless you add the Vc::Aligned flag as a second
 *            argument.
 */
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
    load(mem);
}
/**
 * Construct a vector from loading its entries from the array at \p mem.
 *
 * The constructor is constrained (via enable_if) to \p Flags types for which
 * `Traits::is_load_store_flag<Flags>::value` is \c true. It forwards to load(mem, flags).
 *
 * \param mem A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer
 *            must be aligned on a MemoryAlignment boundary.
 * \param flags A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming,
 *              Vc::Unaligned, and/or Vc::PrefetchDefault.
 */
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
    load(mem, flags);
}

/**
 * Construct a vector from loading its entries from the array of type \p U at \p x.
 *
 * The constructor is constrained (via enable_if) to the case where \p U is an arithmetic
 * type, \p Flags is a load/store flag type, and the load is not an integral load from a
 * wider type (i.e. it is excluded if \p U and EntryType are both integral and
 * `sizeof(EntryType) < sizeof(U)`). It forwards to `load<U, Flags>(x, flags)`.
 *
 * \param x A pointer to the data to load from.
 * \param flags The load flags. If omitted, a default-constructed \p Flags object is used,
 *              which is DefaultLoadTag unless \p Flags is specified explicitly.
 */
template <typename U, typename Flags = DefaultLoadTag,
          typename = enable_if<
              (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
               sizeof(EntryType) >= sizeof(U)) &&
              std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
    load<U, Flags>(x, flags);
}

// load member functions{{{1
/**
 * Load the vector entries from \p mem, overwriting the previous values.
 *
 * Forwards to load(mem, DefaultLoadTag()).
 *
 * \param mem
 * A pointer to data. The pointer need not be aligned on a MemoryAlignment boundary unless
 * you add the Vc::Aligned flag as a second argument.
 */
Vc_INTRINSIC void load(const EntryType *mem)
{
    load(mem, DefaultLoadTag());
}
/**
 * Load the vector entries from \p mem, overwriting the previous values.
 *
 * The overload is constrained (via the enable_if return type) to \p Flags types for which
 * `Traits::is_load_store_flag<Flags>::value` is \c true. It forwards to
 * `load<EntryType, Flags>(mem, flags)`.
 *
 * \param mem
 * A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer must be
 * aligned on a MemoryAlignment boundary.
 * \param flags
 * A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, Vc::Unaligned,
 * and/or Vc::PrefetchDefault.
 */
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
    load<EntryType, Flags>(mem, flags);
}
private:
/**\internal
 * Constraint for the generic load function: the member \c type (\c void) exists only if
 * \p U is an arithmetic type, \p Flags is a load/store flag type, and the load is not an
 * integral load from a type wider than EntryType.
 */
template <typename U, typename Flags>
struct load_concept
    : public std::enable_if<
          std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value &&
              (!(std::is_integral<U>::value && std::is_integral<EntryType>::value) ||
               sizeof(EntryType) >= sizeof(U)),
          void>
{
};

public:
/**
 * Load the vector entries from the array of type \p U at \p mem, overwriting the previous
 * values.
 *
 * This is only the declaration; the other load overloads and the load constructors
 * forward to this function. The return type `load_concept<U, Flags>::type` constrains the
 * overload to arithmetic \p U and load/store flag types \p Flags, and excludes integral
 * loads from a type wider than EntryType.
 *
 * \param mem A pointer to the data to load from.
 *
 * The second (unnamed) parameter selects the load flags via its type. It defaults to
 * `Flags()`, which is DefaultLoadTag unless \p Flags is specified explicitly.
 */
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
//}}}1

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

///////////////////////////////////////////////////////////////////////////////////////////
// stores

/**
 * Store the vector data to \p mem.
 *
 * \param mem A pointer to memory, where \VSize{T} consecutive values will be stored.
 * \param flags The flags parameter can be used to select e.g. the Vc::Aligned,
 *              Vc::Unaligned, Vc::Streaming, and/or Vc::PrefetchDefault flags.
 */
template <
    typename U,
    typename Flags = DefaultStoreTag,
    typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;

/**
 * Store the vector data to \p mem where \p mask is set.
 *
 * \param mem A pointer to memory, where \VSize{T} consecutive values will be stored.
 * \param mask A mask object that determines which entries of the vector should be stored
 *             to \p mem.
 * \param flags The flags parameter can be used to select e.g. the Vc::Aligned,
 *              Vc::Unaligned, Vc::Streaming, and/or Vc::PrefetchDefault flags.
 *
 * \note
 * The masked store does not pack the values into memory. I.e. the value at offset \c i
 * will be stored to `mem[i]`, independent of whether `mask[j]` for any `j < i` is \c
 * false.
 */
template <
    typename U,
    typename Flags = DefaultStoreTag,
    typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;

//@{
/**
 * The following store overloads support classes that have a cast operator to `EntryType
 * *`.
 */
// Store to an EntryType array with the default store flags. Forwards to the generic
// store template with U = EntryType and Flags = DefaultStoreTag.
Vc_INTRINSIC void store(EntryType *mem) const
{
    store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}

// Store to an EntryType array with explicitly given flags. Constrained (via enable_if) to
// Flags types for which Traits::is_load_store_flag<Flags>::value is true; forwards to the
// generic store template with U = EntryType.
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
    store<EntryType, Flags>(mem, flags);
}

// Masked store to an EntryType array with the default store flags. Forwards to the generic
// masked store template with U = EntryType and Flags = DefaultStoreTag.
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
    store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}

// Masked store to an EntryType array with explicitly given flags. Constrained (via
// enable_if) to Flags types for which Traits::is_load_store_flag<Flags>::value is true;
// forwards to the generic masked store template with U = EntryType.
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
    store<EntryType, Flags>(mem, mask, flags);
}
//@}

// vim: foldmethod=marker
    ///@}

    /**
     * Set all entries to zero.
     */
    inline void setZero();

    /**
     * Set all entries to zero where the mask is set.
     *
     * A 4-vector with a mask of `[0111]` therefore would set the last three entries to 0.
     *
     * \param mask Selects the entries to be set to zero.
     */
    inline void setZero(MaskType mask);

    /**
     * Set all entries to zero where the mask is not set.
     *
     * A 4-vector with a mask of `[0111]` therefore would set only the first entry to 0.
     *
     * \param mask Selects the entries to not be set to zero.
     */
    inline void setZeroInverted(MaskType mask);

    /**
     * Set all entries to the bit representation of a QNaN.
     */
    inline void setQnan();

    /**
     * Set all entries to the bit representation of a QNaN where the mask is set.
     *
     * \param mask Selects the entries to be set to QNaN.
     */
    inline void setQnan(MaskType mask);

#define Vc_CURRENT_CLASS_NAME Vector
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif

///////////////////////////////////////////////////////////////////////////////////////////
// gathers
// A gather takes the following arguments:
// 1. A const pointer to memory of any type that can convert to EntryType
// 2. An indexes “vector”. The requirement is that the type implements the subscript operator,
//    stores «Size» valid index values, and each offset to the pointer above yields a valid
//    memory location for reading.
// 3. Optionally the third argument may be a mask. The mask disables several memory reads and
//    thus removes the requirements in (2.) for the disabled entries.

private:
    /**\internal
     * This function implements a gather given a pointer to memory \p mem and some
     * container object storing the gather \p indexes.
     *
     * Only the declaration lives here; all public gather entry points below funnel into
     * this function after normalizing their index argument with adjustIndexParameter().
     *
     * \tparam MT The type of the objects in memory. The public entry points assert that
     * it is convertible to \p EntryType.
     * \tparam IT The type of the index container. The public entry points assert that it
     * implements the subscript operator.
     *
     * \param mem This pointer must be aligned correctly for the type \p MT. This is the
     * natural behavior of C++, so this is typically the case.
     * \param indexes This object contains at least \VSize{T} indexes that denote the
     * offset in \p mem where the components for the current vector should be copied from.
     * The offset is not in Bytes, but in multiples of `sizeof(MT)`.
     */
    // Intended constraint (not enforced on this declaration):
    // enable_if<std::is_convertible<MT, EntryType>::value &&
    // has_subscript_operator<IT>::value>
    template <typename MT, typename IT>
    inline void gatherImplementation(const MT *mem, const IT &indexes);

    /**\internal
     * This overload of the above function adds a \p mask argument to disable memory
     * accesses at the \p indexes offsets where \p mask is \c false.
     *
     * \param mem     Pointer to the memory to gather from.
     * \param indexes Container of offsets into \p mem (in multiples of `sizeof(MT)`).
     * \param mask    Only entries where \p mask is \c true are read from memory.
     */
    template <typename MT, typename IT>
    inline void gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask);

    /**\internal
     * Overload for the case of C-arrays or %Vc vector objects.
     *
     * This overload participates only if \p IT is a pointer type or a SIMD vector type.
     * In this case the \p indexes parameter is usable without adjustment.
     *
     * \param indexes An object to be used for gather or scatter.
     * \returns Forwards the \p indexes parameter (the very same reference, no copy).
     */
    template <typename IT, typename = enable_if<std::is_pointer<IT>::value ||
                                                Traits::is_simd_vector<IT>::value>>
    static Vc_INTRINSIC const IT &adjustIndexParameter(const IT &indexes)
    {
        return indexes;
    }

    /**\internal
     * Overload for the case of a container that returns an lvalue reference from its
     * subscript operator.
     *
     * This overload participates only if \p IT is neither a pointer nor a SIMD vector and
     * `i[0]` (on a const \p IT) yields an lvalue reference.
     *
     * In this case the container is assumed to use contiguous storage and therefore the
     * \p i object is converted to a C-array interface.
     *
     * \param i An object to be used for gather or scatter.
     * \returns A pointer to the first object in the \p i container.
     */
    template <
        typename IT,
        typename = enable_if<
            !std::is_pointer<IT>::value && !Traits::is_simd_vector<IT>::value &&
            std::is_lvalue_reference<decltype(std::declval<const IT &>()[0])>::value>>
    static Vc_INTRINSIC decltype(std::addressof(std::declval<const IT &>()[0]))
    adjustIndexParameter(const IT &i)
    {
        // std::addressof yields the real address even if the element type overloads
        // operator&.
        return std::addressof(i[0]);
    }

    /**\internal
     * Overload for the case of a container that returns an rvalue from its
     * subscript operator.
     *
     * This overload participates only if \p IT is neither a pointer nor a SIMD vector and
     * `i[0]` (on a const \p IT) does not yield an lvalue reference. No pointer to the
     * elements can be formed in that case, so the container itself is handed on.
     *
     * \param i An object to be used for gather or scatter.
     * \returns The \p i parameter. Note that the declared return type is \p IT (not a
     *          reference), i.e. the caller receives the container by value.
     */
    template <typename IT>
    static Vc_INTRINSIC enable_if<
        !std::is_pointer<IT>::value && !Traits::is_simd_vector<IT>::value &&
            !std::is_lvalue_reference<decltype(std::declval<const IT &>()[0])>::value,
        IT>
    adjustIndexParameter(const IT &i)
    {
        return i;
    }

public:
// Compile-time validation shared by the gather entry points below. The macro expects the
// template parameters MT (type of the objects in memory) and IT (type of the indexes
// argument) of the enclosing function to be in scope and asserts that
//  - MT is convertible to EntryType,
//  - IT implements the subscript operator,
//  - a SIMD vector used for the indexes has at least Size entries,
//  - a C-array used for the indexes is one-dimensional and, unless its bound is unknown,
//    has at least Size entries.
// The array check has to inspect IT (the indexes type). Inspecting the vector's entry type
// T instead can never see an array, which would make the last assertion vacuous.
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_                                                \
    static_assert(                                                                       \
        std::is_convertible<MT, EntryType>::value,                                       \
        "The memory pointer needs to point to a type that can be converted to the "      \
        "EntryType of this SIMD vector type.");                                          \
    static_assert(                                                                       \
        Vc::Traits::has_subscript_operator<IT>::value,                                   \
        "The indexes argument must be a type that implements the subscript operator.");  \
    static_assert(                                                                       \
        !Traits::is_simd_vector<IT>::value ||                                            \
            Traits::simd_vector_size<IT>::value >= Size,                                 \
        "If you use a SIMD vector for the indexes parameter, the index vector must "     \
        "have at least as many entries as this SIMD vector.");                           \
    static_assert(                                                                       \
        !std::is_array<IT>::value ||                                                     \
            (std::rank<IT>::value == 1 &&                                                \
             (std::extent<IT>::value == 0 || std::extent<IT>::value >= Size)),           \
        "If you use a simple array for the indexes parameter, the array must have "      \
        "at least as many entries as this SIMD vector.")

    /**
     * \name Gather constructors and member functions
     *
     * Constructs or loads a vector from the objects at `mem[indexes[0]]`,
     * `mem[indexes[1]]`, `mem[indexes[2]]`, ...
     *
     * All gather functions optionally take a mask as last argument. In that case only the
     * entries that are selected in the mask are accessed in memory and copied to the
     * vector. This enables invalid indexes in the \p indexes vector if those are masked
     * off in \p mask.
     *
     * Gathers from structured data (AoS: arrays of struct) are possible via a special
     * subscript operator of the container (array). You can use \ref Vc::array and \ref
     * Vc::vector as drop-in replacements for \c std::array and \c std::vector. These
     * container classes contain the necessary subscript operator overload. Example:
     * \code
     * Vc::vector<float> data(100);
     * std::iota(data.begin(), data.end(), 0.f);  // fill with values 0, 1, 2, ...
     * auto indexes = float_v::IndexType::IndexesFromZero();
     * float_v gathered = data[indexes];  // gathered == [0, 1, 2, ...]
     * \endcode
     *
     * Alternatively, you can use Vc::Common::AdaptSubscriptOperator to extend a given
     * container class with the necessary subscript operator. Example:
     * \code
     * template <typename T, typename Allocator = std::allocator<T>>
     * using my_vector = Vc::Common::AdaptSubscriptOperator<std::vector<T, Allocator>>;
     * \endcode
     *
     * \param mem A pointer to memory which contains objects of type \p MT at the offsets
     *            given by \p indexes.
     * \param indexes A container/vector of offsets into \p mem.
     *                The type of \p indexes (\p IT) may either be a pointer to integers
     *                (C-array) or a vector of integers (preferably IndexType).
     * \param mask If a mask is given, only the active entries will be copied from memory.
     *
     * \note If you use a masked gather constructor the masked-off entries of the vector
     * are zero-initialized.
     */
    ///@{

    /// Gather constructor: initializes the vector from `mem[indexes[0]]`,
    /// `mem[indexes[1]]`, ...
    template <typename MT, typename IT,
              typename = enable_if<Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        // Bring the index argument into the form the implementation works with.
        const auto &offsets = adjustIndexParameter(indexes);
        gatherImplementation(mem, offsets);
    }

    /// Masked gather constructor: like the gather constructor, but only the entries
    /// selected by \p mask are read from memory.
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
                                       MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        // Bring the index argument into the form the implementation works with.
        const auto &offsets = adjustIndexParameter(indexes);
        gatherImplementation(mem, offsets, mask);
    }

    /// Gather function: loads this vector from `mem[indexes[0]]`, `mem[indexes[1]]`, ...
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        // Bring the index argument into the form the implementation works with.
        const auto &offsets = adjustIndexParameter(indexes);
        gatherImplementation(mem, offsets);
    }

    /// Masked gather function: like gather(mem, indexes), but only the entries selected
    /// by \p mask are read from memory.
    template <typename MT, typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
    {
        Vc_ASSERT_GATHER_PARAMETER_TYPES_;
        // Bring the index argument into the form the implementation works with.
        const auto &offsets = adjustIndexParameter(indexes);
        gatherImplementation(mem, offsets, mask);
    }
    ///@}

    /// \name Deprecated Members
    ///@{

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Gather constructor reading one struct member per selected element of \p array. The
     * call is forwarded to gather() with the arguments of a Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member of the struct \p S1 to be read. The offsets in
     *                \p indexes therefore count elements of \p array and not objects of
     *                the gathered type (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     */
    template <typename S1, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
                                                           const EntryType S1::*member1,
                                                           IT indexes)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1].gatherArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked gather constructor reading one struct member per selected element of
     * \p array. The call is forwarded to the masked gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member of the struct \p S1 to be read. The offsets in
     *                \p indexes therefore count elements of \p array and not objects of
     *                the gathered type (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     * \param mask    Only the active entries will be gathered.
     */
    template <typename S1, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
                                                           const EntryType S1::*member1,
                                                           IT indexes, MaskArgument mask)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1].gatherArguments(), mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Gather constructor reading a member of a nested struct per selected element of
     * \p array. The call is forwarded to gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member (of struct type \p S2) of the struct \p S1. The
     *                offsets in \p indexes therefore count elements of \p array and not
     *                objects of the gathered type (i.e. array[i].*member1 is accessed
     *                instead of (&(array->*member1))[i]).
     * \param member2 Selects the member to be read from the struct chosen by \p member1
     *                (i.e. array[i].*member1.*member2 is read).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     */
    template <typename S1, typename S2, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
                                                           const S2 S1::*member1,
                                                           const EntryType S2::*member2,
                                                           IT indexes)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1][member2].gatherArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked gather constructor reading a member of a nested struct per selected element
     * of \p array. The call is forwarded to the masked gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member (of struct type \p S2) of the struct \p S1. The
     *                offsets in \p indexes therefore count elements of \p array and not
     *                objects of the gathered type (i.e. array[i].*member1 is accessed
     *                instead of (&(array->*member1))[i]).
     * \param member2 Selects the member to be read from the struct chosen by \p member1
     *                (i.e. array[i].*member1.*member2 is read).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     * \param mask    Only the active entries will be gathered.
     */
    template <typename S1, typename S2, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
                                                           const S2 S1::*member1,
                                                           const EntryType S2::*member2,
                                                           IT indexes, MaskArgument mask)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1][member2].gatherArguments(), mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Gather constructor going through a pointer member of the elements of \p array. The
     * call is forwarded to gather() with the arguments of a Common::SubscriptOperation.
     *
     * \param array        A pointer into memory (without alignment restrictions).
     * \param ptrMember1   Selects a member of the struct \p S1 that is itself a pointer
     *                     to \p EntryType objects.
     * \param outerIndexes The offsets into \p array; they count elements of \p array.
     * \param innerIndexes Applied via subscript after \p ptrMember1 was selected, i.e.
     *                     presumably the offsets into the memory the selected pointer
     *                     refers to (confirm against Common::SubscriptOperation).
     */
    template <typename S1, typename IT1, typename IT2>
    Vc_DEPRECATED(
        "use the subscript operator to Vc::array or Vc::vector "
        "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
                                                 const EntryType *const S1::*ptrMember1,
                                                 IT1 outerIndexes, IT2 innerIndexes)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, outerIndexes)[ptrMember1][innerIndexes]
                   .gatherArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked gather constructor going through a pointer member of the elements of
     * \p array. The call is forwarded to the masked gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array        A pointer into memory (without alignment restrictions).
     * \param ptrMember1   Selects a member of the struct \p S1 that is itself a pointer
     *                     to \p EntryType objects.
     * \param outerIndexes The offsets into \p array; they count elements of \p array.
     * \param innerIndexes Applied via subscript after \p ptrMember1 was selected, i.e.
     *                     presumably the offsets into the memory the selected pointer
     *                     refers to (confirm against Common::SubscriptOperation).
     * \param mask         Only the active entries will be gathered.
     */
    template <typename S1, typename IT1, typename IT2>
    Vc_DEPRECATED(
        "use the subscript operator to Vc::array or Vc::vector "
        "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
                                                 const EntryType *const S1::*ptrMember1,
                                                 IT1 outerIndexes, IT2 innerIndexes,
                                                 MaskArgument mask)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, outerIndexes)[ptrMember1][innerIndexes]
                   .gatherArguments(),
               mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Gather function reading one struct member per selected element of \p array. The
     * call is forwarded to gather() with the arguments of a Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member of the struct \p S1 to be read. The offsets in
     *                \p indexes therefore count elements of \p array and not objects of
     *                the gathered type (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     */
    template <typename S1, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void gather(const S1 *array,
                                                 const EntryType S1::*member1, IT indexes)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1].gatherArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked gather function reading one struct member per selected element of
     * \p array. The call is forwarded to the masked gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member of the struct \p S1 to be read. The offsets in
     *                \p indexes therefore count elements of \p array and not objects of
     *                the gathered type (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     * \param mask    Only the active entries will be gathered.
     */
    template <typename S1, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void gather(const S1 *array,
                                                 const EntryType S1::*member1,
                                                 IT indexes,
                                                 MaskArgument mask)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1].gatherArguments(), mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Gather function reading a member of a nested struct per selected element of
     * \p array. The call is forwarded to gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member (of struct type \p S2) of the struct \p S1. The
     *                offsets in \p indexes therefore count elements of \p array and not
     *                objects of the gathered type (i.e. array[i].*member1 is accessed
     *                instead of (&(array->*member1))[i]).
     * \param member2 Selects the member to be read from the struct chosen by \p member1
     *                (i.e. array[i].*member1.*member2 is read).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     */
    template <typename S1, typename S2, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void gather(const S1 *array, const S2 S1::*member1,
                                                 const EntryType S2::*member2, IT indexes)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1][member2].gatherArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked gather function reading a member of a nested struct per selected element of
     * \p array. The call is forwarded to the masked gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 Selects the member (of struct type \p S2) of the struct \p S1. The
     *                offsets in \p indexes therefore count elements of \p array and not
     *                objects of the gathered type (i.e. array[i].*member1 is accessed
     *                instead of (&(array->*member1))[i]).
     * \param member2 Selects the member to be read from the struct chosen by \p member1
     *                (i.e. array[i].*member1.*member2 is read).
     * \param indexes The offsets into \p array where the values are gathered from. The
     *                type can either be an integer vector or a type that supports
     *                operator[] access.
     * \param mask    Only the active entries will be gathered.
     */
    template <typename S1, typename S2, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void gather(const S1 *array, const S2 S1::*member1,
                                                 const EntryType S2::*member2, IT indexes,
                                                 MaskArgument mask)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, indexes)[member1][member2].gatherArguments(), mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Gather function going through a pointer member of the elements of \p array. The
     * call is forwarded to gather() with the arguments of a Common::SubscriptOperation.
     *
     * \param array        A pointer into memory (without alignment restrictions).
     * \param ptrMember1   Selects a member of the struct \p S1 that is itself a pointer
     *                     to \p EntryType objects.
     * \param outerIndexes The offsets into \p array; they count elements of \p array.
     * \param innerIndexes Applied via subscript after \p ptrMember1 was selected, i.e.
     *                     presumably the offsets into the memory the selected pointer
     *                     refers to (confirm against Common::SubscriptOperation).
     */
    template <typename S1, typename IT1, typename IT2>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void gather(const S1 *array,
                                                 const EntryType *const S1::*ptrMember1,
                                                 IT1 outerIndexes, IT2 innerIndexes)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, outerIndexes)[ptrMember1][innerIndexes]
                   .gatherArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked gather function going through a pointer member of the elements of
     * \p array. The call is forwarded to the masked gather() with the arguments of a
     * Common::SubscriptOperation.
     *
     * \param array        A pointer into memory (without alignment restrictions).
     * \param ptrMember1   Selects a member of the struct \p S1 that is itself a pointer
     *                     to \p EntryType objects.
     * \param outerIndexes The offsets into \p array; they count elements of \p array.
     * \param innerIndexes Applied via subscript after \p ptrMember1 was selected, i.e.
     *                     presumably the offsets into the memory the selected pointer
     *                     refers to (confirm against Common::SubscriptOperation).
     * \param mask         Only the active entries will be gathered.
     */
    template <typename S1, typename IT1, typename IT2>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void gather(const S1 *array,
                                                 const EntryType *const S1::*ptrMember1,
                                                 IT1 outerIndexes, IT2 innerIndexes,
                                                 MaskArgument mask)
    {
        using MemberAccess = Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>;
        gather(MemberAccess(array, outerIndexes)[ptrMember1][innerIndexes]
                   .gatherArguments(),
               mask);
    }
    ///@}

    /**\internal
     * \name Gather function to use from Vc::Common::subscript_operator
     *
     * \param args
     * \param mask
     */
    ///@{
    /// Unpacks \p args into its address and (adjusted) indexes and forwards both to
    /// gather(mem, indexes).
    template <typename MT, typename IT>
    Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT> &args)
    {
        const auto &offsets = adjustIndexParameter(args.indexes);
        gather(args.address, offsets);
    }

    /// Unpacks \p args into its address and (adjusted) indexes and forwards both,
    /// together with \p mask, to gather(mem, indexes, mask).
    template <typename MT, typename IT>
    Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT> &args, MaskArgument mask)
    {
        const auto &offsets = adjustIndexParameter(args.indexes);
        gather(args.address, offsets, mask);
    }
    ///@}

// The assertion helper is only meant for the gather functions above; drop it so it does
// not leak out of this section.
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

///////////////////////////////////////////////////////////////////////////////////////////
// scatters
// A scatter takes the following arguments:
// 1. A pointer to memory of any type that EntryType can convert to.
// 2. An indexes “vector”. The requirement is that the type implements the subscript operator,
//    stores «Size» valid index values, and each offset to the pointer above yields a valid
//    memory location for writing.
// 3. Optionally the third argument may be a mask. The mask disables several memory stores and
//    thus removes the requirements in (2.) for the disabled entries.

private:
    /**\internal
     * This function implements a scatter given a pointer to memory \p mem and some
     * container object storing the scatter \p indexes.
     *
     * Only the declaration lives here. The function is const: it writes to \p mem and
     * leaves the vector itself untouched.
     *
     * \tparam MT The type of the objects in memory.
     * \tparam IT The type of the index container; deduced from a forwarding reference.
     *
     * \param mem This pointer must be aligned correctly for the type \p MT. This is the
     * natural behavior of C++, so this is typically the case.
     * \param indexes This object contains at least \VSize{T} indexes that denote the
     * offset in \p mem where the components for the current vector should be copied to.
     * The offset is not in Bytes, but in multiples of `sizeof(MT)`.
     */
    // Intended constraint (not enforced on this declaration):
    // enable_if<std::is_convertible<EntryType, MT>::value && has_subscript_operator<IT>::value>
    template <typename MT, typename IT>
    inline void scatterImplementation(MT *mem, IT &&indexes) const;

    /**\internal
     * This overload of the above function adds a \p mask argument to disable memory
     * accesses at the \p indexes offsets where \p mask is \c false.
     *
     * \param mem     Pointer to the memory to scatter to.
     * \param indexes Container of offsets into \p mem (in multiples of `sizeof(MT)`).
     * \param mask    Only entries where \p mask is \c true are written to memory.
     */
    template <typename MT, typename IT>
    inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;

public:
// Compile-time validation of the template arguments MT and IT of the scatter()
// overloads. The macro expands to four static_asserts:
//  1. EntryType must be convertible to MT.
//  2. IT must implement the subscript operator.
//  3. A SIMD vector used as index container must provide at least Size entries.
//  4. A built-in array used as index container must be one-dimensional and either of
//     unknown bound or provide at least Size entries.
// IT is deduced from a forwarding reference and thus is a reference type for lvalue
// arguments. std::is_array, std::rank and std::extent do not look through references,
// which is why (4.) inspects std::remove_reference<IT>::type. (Previously (4.) tested
// the vector's element type T, which is never an array, so the check could not fire.)
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_                                               \
    static_assert(                                                                       \
        std::is_convertible<EntryType, MT>::value,                                       \
        "The memory pointer needs to point to a type that the EntryType of this "        \
        "SIMD vector type can be converted to.");                                        \
    static_assert(                                                                       \
        Vc::Traits::has_subscript_operator<IT>::value,                                   \
        "The indexes argument must be a type that implements the subscript operator.");  \
    static_assert(                                                                       \
        !Traits::is_simd_vector<IT>::value ||                                            \
            Traits::simd_vector_size<IT>::value >= Size,                                 \
        "If you use a SIMD vector for the indexes parameter, the index vector must "     \
        "have at least as many entries as this SIMD vector.");                           \
    static_assert(                                                                       \
        !std::is_array<typename std::remove_reference<IT>::type>::value ||               \
            (std::rank<typename std::remove_reference<IT>::type>::value == 1 &&          \
             (std::extent<typename std::remove_reference<IT>::type>::value == 0 ||       \
              std::extent<typename std::remove_reference<IT>::type>::value >= Size)),    \
        "If you use a simple array for the indexes parameter, the array must have "      \
        "at least as many entries as this SIMD vector.")

    /**
     * \name Scatter functions
     *
     * Stores a vector to the objects at `mem[indexes[0]]`, `mem[indexes[1]]`,
     * `mem[indexes[2]]`, ...
     *
     * \param mem A pointer to memory which contains objects of type \p MT at the offsets
     *            given by \p indexes.
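     * \param indexes An object implementing the subscript operator; `indexes[i]` is the
     *                offset (in multiples of `sizeof(MT)`) into \p mem where the i-th
     *                vector component is stored.
     * \param mask    Optional. Components for which \p mask is \c false are not stored.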
     */
    ///@{

    /// Unmasked scatter: checks the argument types at compile time and then forwards
    /// \p mem and \p indexes to the internal scatter implementation.
    template <typename MT,
              typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
    {
        Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
        this->scatterImplementation(mem, std::forward<IT>(indexes));
    }

    /// Masked scatter: checks the argument types at compile time and then forwards
    /// \p mem, \p indexes and \p mask to the internal masked scatter implementation.
    template <typename MT,
              typename IT,
              typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
    Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
    {
        Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
        this->scatterImplementation(mem, std::forward<IT>(indexes), mask);
    }
    ///@}

    /// \name Deprecated Members
    ///@{

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Scatters the components of this vector to one member of the structs in \p array.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 If \p array points to a struct, \p member1 determines the member in
     *                the struct to be written. Thus the offsets in \p indexes are
     *                relative to the \p array and not to the size of the scattered type
     *                (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param indexes Determines the offsets into \p array where the values are scattered
     *                to. The type of indexes can either be an integer vector or a type
     *                that supports operator[] access.
     */
    template <typename S1, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void scatter(S1 *array, EntryType S1::*member1,
                                                  IT indexes) const
    {
        // Build a temporary subscript operation, select the member, and forward the
        // result of scatterArguments() to the single-argument scatter overload.
        using Subscript = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        this->scatter(Subscript(array, indexes)[member1].scatterArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked scatter of the components of this vector to one member of the structs in
     * \p array.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 If \p array points to a struct, \p member1 determines the member in
     *                the struct to be written. Thus the offsets in \p indexes are
     *                relative to the \p array and not to the size of the scattered type
     *                (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param indexes Determines the offsets into \p array where the values are scattered
     *                to. The type of indexes can either be an integer vector or a type
     *                that supports operator[] access.
     * \param mask    Only the active entries of \p mask are scattered.
     */
    template <typename S1, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void scatter(S1 *array, EntryType S1::*member1,
                                                  IT indexes, MaskArgument mask) const
    {
        // Build a temporary subscript operation, select the member, and forward the
        // result of scatterArguments() together with the mask.
        using Subscript = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        this->scatter(Subscript(array, indexes)[member1].scatterArguments(), mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Scatters the components of this vector to a member of a nested struct, i.e. to
     * array[i].*member1.*member2.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 If \p array points to a struct, \p member1 determines the member in
     *                the struct to be accessed. Thus the offsets in \p indexes are
     *                relative to the \p array and not to the size of the scattered type
     *                (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param member2 If \p member1 is a struct then \p member2 selects the member to be
     *                written in that struct (i.e. array[i].*member1.*member2 is
     *                written).
     * \param indexes Determines the offsets into \p array where the values are scattered
     *                to. The type of indexes can either be an integer vector or a type
     *                that supports operator[] access.
     */
    template <typename S1, typename S2, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void scatter(S1 *array, S2 S1::*member1,
                                                  EntryType S2::*member2,
                                                  IT indexes) const
    {
        // Temporary subscript operation; both member selections are applied in order
        // before the scatter arguments are extracted.
        using Subscript = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        this->scatter(Subscript(array, indexes)[member1][member2].scatterArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked scatter of the components of this vector to a member of a nested struct,
     * i.e. to array[i].*member1.*member2.
     *
     * \param array   A pointer into memory (without alignment restrictions).
     * \param member1 If \p array points to a struct, \p member1 determines the member in
     *                the struct to be accessed. Thus the offsets in \p indexes are
     *                relative to the \p array and not to the size of the scattered type
     *                (i.e. array[i].*member1 is accessed instead of
     *                (&(array->*member1))[i]).
     * \param member2 If \p member1 is a struct then \p member2 selects the member to be
     *                written in that struct (i.e. array[i].*member1.*member2 is
     *                written).
     * \param indexes Determines the offsets into \p array where the values are scattered
     *                to. The type of indexes can either be an integer vector or a type
     *                that supports operator[] access.
     * \param mask    Only the active entries of \p mask are scattered.
     */
    template <typename S1, typename S2, typename IT>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void scatter(S1 *array, S2 S1::*member1,
                                                  EntryType S2::*member2, IT indexes,
                                                  MaskArgument mask) const
    {
        // Temporary subscript operation; both member selections are applied in order
        // before the scatter arguments are extracted and forwarded with the mask.
        using Subscript = Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>;
        this->scatter(Subscript(array, indexes)[member1][member2].scatterArguments(),
                      mask);
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Scatters the components of this vector through a pointer member of the structs in
     * \p array.
     *
     * \param array        A pointer into memory (without alignment restrictions).
     * \param ptrMember1   If \p array points to a struct, \p ptrMember1 determines the
     *                     (pointer) member in the struct to be accessed. Thus the
     *                     offsets in \p outerIndexes are relative to the \p array and
     *                     not to the size of the scattered type.
     * \param outerIndexes Subscripts applied to \p array, before \p ptrMember1 is
     *                     selected.
     * \param innerIndexes Subscripts applied after \p ptrMember1 has been selected
     *                     (presumably offsets relative to the pointer stored in that
     *                     member; confirm against Common::SubscriptOperation).
     */
    template <typename S1, typename IT1, typename IT2>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1,
                                                  IT1 outerIndexes,
                                                  IT2 innerIndexes) const
    {
        // Temporary subscript operation on the outer indexes; the pointer member and the
        // inner indexes are applied in order before extracting the scatter arguments.
        using Subscript = Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>;
        this->scatter(
            Subscript(array, outerIndexes)[ptrMember1][innerIndexes].scatterArguments());
    }

    /**
     * \deprecated Use Vc::array or Vc::vector subscripting instead.
     *
     * Masked scatter of the components of this vector through a pointer member of the
     * structs in \p array.
     *
     * \param array        A pointer into memory (without alignment restrictions).
     * \param ptrMember1   If \p array points to a struct, \p ptrMember1 determines the
     *                     (pointer) member in the struct to be accessed. Thus the
     *                     offsets in \p outerIndexes are relative to the \p array and
     *                     not to the size of the scattered type.
     * \param outerIndexes Subscripts applied to \p array, before \p ptrMember1 is
     *                     selected.
     * \param innerIndexes Subscripts applied after \p ptrMember1 has been selected
     *                     (presumably offsets relative to the pointer stored in that
     *                     member; confirm against Common::SubscriptOperation).
     * \param mask         Only the active entries of \p mask are scattered.
     */
    template <typename S1, typename IT1, typename IT2>
    Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
                  "instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1,
                                                  IT1 outerIndexes, IT2 innerIndexes,
                                                  MaskArgument mask) const
    {
        // Temporary subscript operation on the outer indexes; the pointer member and the
        // inner indexes are applied in order, then forwarded together with the mask.
        using Subscript = Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>;
        this->scatter(
            Subscript(array, outerIndexes)[ptrMember1][innerIndexes].scatterArguments(),
            mask);
    }
    ///@}

    /**\internal
     * \name Scatter function to use from Vc::Common::subscript_operator
     *
     * \param args Bundles the destination address (`args.address`) and the index
     *             container (`args.indexes`) of the scatter.
     * \param mask Selects the vector components that are stored (masked overload only).
     */
    ///@{
    /// Unpacks \p args and forwards to the (address, indexes) scatter overload.
    template <typename MT, typename IT>
    Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
    {
        this->scatter(args.address, args.indexes);
    }

    /// Unpacks \p args and forwards to the (address, indexes, mask) scatter overload.
    template <typename MT, typename IT>
    Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args,
                              MaskArgument mask) const
    {
        this->scatter(args.address, args.indexes, mask);
    }
    ///@}
// The scatter parameter checks are only needed by the scatter() overloads above.
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
// NOTE(review): Vc_CURRENT_CLASS_NAME is not defined in this section; presumably it is
// set up before the gather/scatter interface is pulled into the class - confirm.
#undef Vc_CURRENT_CLASS_NAME

    /// \name Scalar Subscript Operators
    ///@{
    /**
     * This operator can be used to modify scalar entries of the vector.
     *
     * \param index A value in the range [0, Size). This value is not checked internally
     *              so you must make/be sure it is in range.
     *
     * \return a reference to the vector entry at the given \p index.
     *
     * \warning The use of this function may result in suboptimal performance. Please
     *          check whether you can find a more vector-friendly way to do what you
     *          intended.
     * \note The returned object models the concept of a reference and as such it can
     *       exist longer than the data it is referencing.
     * \note To avoid lifetime issues, we strongly advise not to store any reference
     *       objects.
     */
    inline reference operator[](size_t index) noexcept;
    /**
     * This operator can be used to read scalar entries of the vector.
     *
     * \param index A value in the range [0, Size). This value is not checked internally
     *              so you must make/be sure it is in range.
     *
     * \return a copy of the vector entry at the given \p index.
     */
    inline EntryType operator[](size_t index) const noexcept;
    ///@}

    /// \name Unary Operators
    ///@{
    /**
     * Determine where the vector is zero.
     *
     * \returns a mask which denotes the zero entries of this vector object.
     */
    inline MaskType operator!() const;

    /**
     * Inverts all bits.
     *
     * \returns a new vector which has all bits inverted. I.e. `(v & ~v) == 0`.
     *
     * \note This operator is only defined for integral types \p T.
     */
    inline Vector operator~() const;

    /// Returns a new vector object with all entries negated.
    inline Vector operator-() const;
    /// Returns a copy of the vector object.
    inline Vector operator+() const;
    ///@}

    /**
     * \name Increment and Decrement Operators
     * The increment and decrement operators apply the increment/decrement operation per
     * component.
     *
     * The semantics are equal to the semantics of the fundamental arithmetic type \p T.
     * The prefix forms return a reference to the vector, the postfix forms return a
     * vector by value.
     *
     * \note Over-/Underflow of signed integral types is undefined behavior and may
     * actually break your code.
     */
    ///@{
    inline Vector &operator++();  // prefix
    inline Vector operator++(int);  // postfix
    inline Vector &operator--();  // prefix
    inline Vector operator--(int);  // postfix
    ///@}

// Vc_OP(symbol) declares the const member `operator symbol` taking another Vector by
// const reference and returning a Vector by value. It is passed to the
// Vc_ALL_ARITHMETICS, Vc_ALL_BINARY and Vc_ALL_SHIFTS macros below and #undef'ed after
// the shift operators.
#define Vc_OP(symbol)                                                                    \
    inline Vc_PURE Vector operator symbol(const Vector &x) const;
    /**
     * \name Arithmetic Operations
     *
     * The arithmetic operations are implemented as component-wise
     * application of the operator on the two vector objects.
     *
     * Example:
     * \code
     * void foo(float_v a, float_v b) {
     *   const float_v product    = a * b;
     *   const float_v difference = a - b;
     *   a += b;
     *   auto quotient = a / b;
     *   auto modulo = static_cast<int_v>(a) % static_cast<int_v>(b);
     * }
     * \endcode
     *
     * \param x The vector to add, subtract, multiply, or divide by.
     * \returns A vector object of the same type with the components filled according to a
     *          component-wise application of the operator.
     *
     * \note If a signed integral vector operation overflows the result is undefined
     * (which is in agreement with the behavior of the fundamental signed integral types
     * in C++).
     */
    ///@{
    Vc_ALL_ARITHMETICS(Vc_OP);
    ///@}

    /**
     * \name Binary Operations
     *
     * The binary (bitwise) operations are implemented as component-wise
     * application of the operator on the two vector objects.
     *
     * Example:
     * \code
     * void foo(int_v a, int_v b) {
     *   const int_v combined_bits = a | b;
     *   const int_v masked_bits = a & b;
     *   a ^= b;  // flipped bits
     * }
     * \endcode
     *
     * \param x The vector used as right-hand operand.
     * \returns A vector object of the same type with the components filled according to a
     *          component-wise application of the operator.
     */
    ///@{
    Vc_ALL_BINARY(Vc_OP);
    ///@}

    /**
     * \name Shift Operations
     *
     * The shift operations are implemented as component-wise
     * application of the operator on the two vector objects.
     *
     * Example:
     * \code
     * void foo(int_v a, int_v b) {
     *   const int_v right = a >> b;
     *   a <<= b;
     * }
     * \endcode
     *
     * \param x The vector holding the per-component shift counts.
     * \returns A vector object of the same type with the components filled according to a
     *          component-wise application of the operator.
     */
    ///@{
    Vc_ALL_SHIFTS(Vc_OP);
    ///@}
// Vc_OP is not needed for the remaining member declarations.
#undef Vc_OP

    /**
     * \name Comparisons
     *
     * All comparison operators return a mask object.
     *
     * Example:
     * \code
     * void foo(const float_v &a, const float_v &b) {
     *   const float_m mask = a < b;
     *   ...
     * }
     * \endcode
     *
     * \param x The vector to compare against.
     * \returns A mask object. Its components contain the boolean results of the
     *          component-wise compare operation.
     */
    ///@{
// Vc_CMP_OP(symbol) declares the const member `operator symbol` returning a MaskType;
// it only exists for the Vc_ALL_COMPARES expansion on the next line.
#define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const;
    Vc_ALL_COMPARES(Vc_CMP_OP);
#undef Vc_CMP_OP
    ///@}

    /**
     * Writemask the vector before an assignment.
     *
     * \param mask The writemask to be used.
     *
     * \return an object that can be used for any kind of masked assignment.
     *
     * The returned object is only to be used for assignments and should not be assigned
     * to a variable.
     *
     * Examples (shown for vectors with four entries):
     * \code
     * float_v v = float_v::Zero();         // v  = [0, 0, 0, 0]
     * int_v v2 = int_v::IndexesFromZero(); // v2 = [0, 1, 2, 3]
     * v(v2 < 2) = 1.f;                     // v  = [1, 1, 0, 0]
     * v(v2 < 3) += 1.f;                    // v  = [2, 2, 1, 0]
     * ++v2(v < 1.f);                       // v2 = [0, 1, 2, 4]
     * \endcode
     */
    inline Common::WriteMaskedVector<Vector, MaskType> operator()(MaskType mask);

    /**
     * \name Horizontal Reduction Operations
     *
     * Horizontal operations can be used to reduce the values of a vector to a scalar
     * value. The overloads taking a \p mask only consider the vector components selected
     * by the mask.
     *
     * Example:
     * \code
     * void foo(const float_v &v) {
     *   float min = v.min(); // smallest value in v
     *   float sum = v.sum(); // sum of all values in v
     * }
     * \endcode
     */
    ///@{

    /// Returns the smallest entry in the vector.
    inline EntryType min() const;
    /// Returns the largest entry in the vector.
    inline EntryType max() const;
    /// Returns the product of all entries in the vector.
    inline EntryType product() const;
    /// Returns the sum of all entries in the vector.
    inline EntryType sum() const;
    /// Returns a vector containing the sum of all entries with smaller index.
    inline Vector partialSum() const;
    /// Returns the smallest entry of the vector components selected by \p mask.
    inline EntryType min(MaskType mask) const;
    /// Returns the largest entry of the vector components selected by \p mask.
    inline EntryType max(MaskType mask) const;
    /// Returns the product of the vector components selected by \p mask.
    inline EntryType product(MaskType mask) const;
    /// Returns the sum of the vector components selected by \p mask.
    inline EntryType sum(MaskType mask) const;
    ///@}

    /**
     * \name Shift and Rotate
     *
     * These functions allow shifting or rotating the entries in a vector.
     *
     * All functions with an \p amount parameter support positive and negative numbers for
     * the shift/rotate value.
     *
     * Example:
     * \code
     * using namespace Vc;
     * int_v foo = int_v::IndexesFromZero() + 1; // e.g. [1, 2, 3, 4] with SSE
     * int_v x;
     * x = foo.shifted( 1); // [2, 3, 4, 0]
     * x = foo.shifted( 2); // [3, 4, 0, 0]
     * x = foo.shifted( 3); // [4, 0, 0, 0]
     * x = foo.shifted( 4); // [0, 0, 0, 0]
     * x = foo.shifted(-1); // [0, 1, 2, 3]
     * x = foo.shifted(-2); // [0, 0, 1, 2]
     * x = foo.shifted(-3); // [0, 0, 0, 1]
     * x = foo.shifted(-4); // [0, 0, 0, 0]
     *
     * x = foo.rotated( 1); // [2, 3, 4, 1]
     * x = foo.rotated( 2); // [3, 4, 1, 2]
     * x = foo.rotated( 3); // [4, 1, 2, 3]
     * x = foo.rotated( 4); // [1, 2, 3, 4]
     * x = foo.rotated(-1); // [4, 1, 2, 3]
     * x = foo.rotated(-2); // [3, 4, 1, 2]
     * x = foo.rotated(-3); // [2, 3, 4, 1]
     * x = foo.rotated(-4); // [1, 2, 3, 4]
     * \endcode
     *
     * These functions are slightly related to the above swizzles. In any case, they are
     * often useful for communication between SIMD lanes or binary decoding operations.
     *
     * \warning Use of these functions leads to less portable code. Consider the scalar
     * implementation where every vector has only one entry. The shift and rotate
     * functions have no useful task to fulfil there and you will almost certainly not get
     * any useful results. It is recommended to add a static_assert for the assumed
     * minimum vector size.
     */
    ///@{

    /// Shift vector entries to the left by \p amount; shifting in zeros. A negative
    /// \p amount shifts to the right (see the example above).
    inline Vector shifted(int amount) const;
    /**
     * Shift vector entries to the left by \p amount; shifting in values from shiftIn
     * (instead of zeros).
     *
     * This function can be used to create vectors from unaligned memory locations.
     *
     * Example:
     * \code
     * Vc::Memory<int_v, 256> mem;
     * for (int i = 0; i < 256; ++i) { mem[i] = i + 1; }
     * int_v a = mem.vectorAt(0);
     * int_v b = mem.vectorAt(int_v::Size);
     * int_v x = a.shifted(1, b);
     * // now x == mem.vectorAt(1, Vc::Unaligned)
     * \endcode
     *
     * \param amount  The number of entries to shift by. \p amount must be between \c
     *                -Size and \c Size, otherwise the result is undefined.
     * \param shiftIn The vector of values to shift in.
     * \return        A new vector with values from \p this and \p shiftIn concatenated
     *                and then shifted by \p amount.
     */
    inline Vector shifted(int amount, Vector shiftIn) const;
    /// Rotate vector entries to the left by \p amount. A negative \p amount rotates to
    /// the right (see the example above).
    inline Vector rotated(int amount) const;
    /// Returns a vector with all components reversed.
    inline Vector reversed() const;
    ///@}

    /**
     * Return a sorted copy of the vector. The vector object itself is not modified (the
     * function is \c const).
     *
     * \returns a sorted vector. The returned values are in ascending order:
       \verbatim
       v[0] <= v[1] <= v[2] <= v[3] ...
       \endverbatim
     *
     * \note If the vector contains NaNs the result is undefined.
     *
     * Example:
     * \code
     * int_v v = int_v::Random();
     * int_v s = v.sorted();
     * std::cout << v << '\n' << s << '\n';
     * \endcode
     *
     * With SSE the output would be (for example):
     *
       \verbatim
       [1513634383, -963914658, 1763536262, -1285037745]
       [-1285037745, -963914658, 1513634383, 1763536262]
       \endverbatim
     *
     * With the Scalar implementation:
       \verbatim
       [1513634383]
       [1513634383]
       \endverbatim
     */
    inline Vector sorted() const;

    /*!
     * \name Apply/Call/Fill Functions
     *
     * There are still many situations where the code needs to switch from SIMD operations
     * to scalar execution. In this case you can, of course, rely on operator[]. But there
     * are also a number of functions that can help with common patterns.
     *
     * The apply functions expect a function that returns a scalar value, i.e. a function
     * of the form "T f(T)".  The call functions do not return a value and thus the
     * function passed does not need a return value. The fill functions are used to
     * serially set the entries of the vector from the return values of a function.
     *
     * Example:
     * \code
     * void foo(float_v v) {
     *   float_v logarithm = v.apply(std::log);
     *   float_v exponential = v.apply(std::exp);
     * }
     * \endcode
     *
     * Of course, you can also use lambdas here:
     * \code
     *   float_v power = v.apply([](float f) { return std::pow(f, 0.6f); });
     * \endcode
     *
     * \param f A functor: this can either be a function or an object that implements
     * operator().
     */
    ///@{

    /// Call \p f sequentially, starting with the minimum up to the maximum value.
    template <typename F> void callWithValuesSorted(F &&f);
    /// Call \p f with the scalar entries of the vector.
    template <typename F> inline void call(F &&f) const;
    /// As above, but skip the entries where \p mask is not set.
    template <typename F> inline void call(F &&f, MaskType mask) const;

    /// Call \p f on every entry of the vector and return the results as a new vector.
    template <typename F> inline Vector apply(F &&f) const;
    /// As above, but skip the entries where \p mask is not set.
    template <typename F> inline Vector apply(F &&f, MaskType mask) const;

    /// Fill the vector with the values [f(0), f(1), f(2), ...]. \p f must be a function
    /// (passed by reference) taking the entry index and returning an \c EntryType.
    template <typename IndexT> inline void fill(EntryType(&f)(IndexT));
    /// Fill the vector with the values [f(), f(), f(), ...]. \p f must be a function
    /// (passed by reference) without parameters returning an \c EntryType.
    inline void fill(EntryType(&f)());
    ///@}

    /**\internal
     * Interleaves this vector and \p x and returns the resulting low vector.
     * Used to implement Vc::interleave.
     */
    inline Vector interleaveLow(Vector x) const;
    /**\internal
     * Interleaves this vector and \p x and returns the resulting high vector.
     * Used to implement Vc::interleave.
     */
    inline Vector interleaveHigh(Vector x) const;

    /**\internal
     * Assigns the components of \p v where \p m is \c true. The remaining components of
     * this vector are left unchanged.
     */
    inline void assign(const Vector &v, const MaskType &m);

    /**
     * \internal
     * \name Internal Data Access
     * Returns a (const) reference to the internal data member, which stores the vector
     * data.
     */
    ///@{
    inline VectorType &data();
    inline const VectorType &data() const;
    ///@}

    /// \name Deprecated Members
    ///@{

    /**
     * Returns the exponents of the floating-point values in the vector.
     *
     * \return A new vector object of the same type containing the exponents.
     *
     * \deprecated Use Vc::exponent instead.
     */
    Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const;

    /**
     * Returns whether a value is negative.
     *
     * \return A new mask object indicating the sign of each vector element.
     *
     * \deprecated Use Vc::isnegative instead.
     */
    Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const;

    /// The number of entries in the vector, as reported by VectorTraits<T, Abi>::size().
    ///\copydoc size
    ///\deprecated Use Vc::Vector::size instead.
    static constexpr size_t Size = VectorTraits<T, Abi>::size();

    /**
     * Casts the current object to \p V2.
     *
     * \tparam V2 The requested destination type.
     * \returns a converted object of type \p V2.
     *
     * \deprecated Use Vc::simd_cast instead.
     */
    template <typename V2> inline V2 staticCast() const;

    /**
     * reinterpret_cast the vector components to construct a vector of type \p V2.
     *
     * \tparam V2 The requested destination type.
     * \returns An object of type \p V2 with the same bit-representation.
     *
     * \deprecated Use Vc::reinterpret_components_cast instead.
     */
    template <typename V2>
    Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2
        reinterpretCast() const;

    /**
     * Copies the signs of the components of \p reference to the components of the current
     * vector, returning the result.
     *
     * \param reference A vector object that determines the sign of the result.
     * \returns A new vector with sign taken from \p reference and absolute value taken
     * from the current vector object.
     *
     * \deprecated Use Vc::copysign instead.
     */
    Vc_DEPRECATED("use copysign(x, y) instead") inline Vector
        copySign(Vector reference) const;
    ///@}

    // NOTE(review): presumably declares operator new/delete overloads that honor
    // alignof(Vector) - confirm against the definition of the macro.
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector));

private:
    /// The only data member of the class, of the implementation-specific VectorType.
    VectorType d;
};

/**
 * \ingroup Utilities
 * Creates an object of type \p V from the Vector \p x by reinterpreting the bits of
 * \p x as an object of type \p V.
 *
 * The function only participates in overload resolution if:
 * - the Vector::size() of the input and output types is equal
 * - the \c VectorEntryTypes of input and output have equal \c sizeof
 * - the \c sizeof of the input and output types is equal
 * - the alignment requirement of \p V is not stricter than that of the input type
 *
 * \tparam V The requested type to change \p x into.
 * \param x The Vector to reinterpret as an object of type \p V.
 * \returns A new object (rvalue) of type \p V.
 *
 * \warning This cast is non-portable since the applicability (see above) may change
 * depending on the default vector types of the target platform. The function is perfectly
 * safe to use with fully specified \p Abi, though.
 */
template <typename V, typename T, typename Abi>
Vc_ALWAYS_INLINE Vc_CONST enable_if<
    (V::size() == Vector<T, Abi>::size() &&
     sizeof(typename V::VectorEntryType) ==
         sizeof(typename Vector<T, Abi>::VectorEntryType) &&
     sizeof(V) == sizeof(Vector<T, Abi>) && alignof(V) <= alignof(Vector<T, Abi>)),
    V>
reinterpret_components_cast(const Vector<T, Abi> &x)
{
    // View the storage of x as a V; the return statement copies it into the result.
    const V &reinterpreted = reinterpret_cast<const V &>(x);
    return reinterpreted;
}

// Vc_OP(symbol) would declare the namespace-scope compound assignment operator
// `operator symbol=` for Vector<T, Abi>. All three instantiations below are commented
// out, so the macro is currently defined and undefined again without being used.
#define Vc_OP(symbol)                                                                    \
    template <typename T, typename Abi>                                                  \
    inline Vector<T, Abi> &operator symbol##=(Vector<T, Abi> &,                          \
                                              const Vector<T, Abi> &x);
    //Vc_ALL_ARITHMETICS(Vc_OP);
    //Vc_ALL_BINARY(Vc_OP);
    //Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP

}  // namespace Vc

#endif  // VC_COMMON_VECTOR_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_MASK_H_
#define VC_COMMON_MASK_H_


namespace Vc_VERSIONED_NAMESPACE
{
/**
 * \class Mask mask.h <Vc/vector.h>
 * \ingroup Masks
 *
 * The main SIMD mask class.
 */
// \tparam T   The entry type of the associated Vector<T, Abi>; it selects the traits
//             (size, alignment, storage type) via VectorTraits<T, Abi>.
// \tparam Abi The ABI tag; defaults to VectorAbi::Best<T>.
template <typename T, typename Abi = VectorAbi::Best<T>> class Mask
{
public:
    /**
     * Returns the number of boolean components (\VSize{T}) in a mask of this type.
     *
     * The size of the mask. I.e. the number of boolean entries in the mask. Do not
     * make any assumptions about the size of masks.
     *
     * In addition, you can easily use if clauses that compare sizes. The compiler can
     * statically evaluate and fully optimize dead code away (very much like \#ifdef, but
     * with syntax checking).
     *
     * \returns The number of components (i.e. \VSize{T}) objects of this mask type store
     * and manipulate.
     */
    static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
    ///\copydoc size
    ///\deprecated Use Vc::Mask::size instead.
    static constexpr size_t Size = VectorTraits<T, Abi>::size();

    /**
     * Specifies the alignment requirement for aligned load and store calls for objects of
     * this mask type.
     */
    static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::maskMemoryAlignment();

    /// The ABI tag type of the current template instantiation.
    using abi = Abi;

    /**
     * The \c EntryType of masks is always \c bool, independent of \c T.
     */
    using EntryType = bool;
    /// \copydoc EntryType
    using value_type = EntryType;

    /// The reference wrapper type used for accessing individual mask components.
    using EntryReference = typename VectorTraits<T, Abi>::EntryReference;
    /// \copydoc EntryReference
    using value_reference = EntryReference;

    /**
     * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
     * implementation.
     * This type is useful for the \c sizeof operator in generic functions.
     */
    using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;

    /**\internal
     * The \c VectorType reveals the implementation-specific internal type used for the SIMD type.
     */
    using VectorType = typename VectorTraits<T, Abi>::VectorType;
    /**\internal
     * \copydoc VectorType
     */
    using vector_type = VectorType;

    /*
     * The associated Vector<T> type.
     * (The alias below is intentionally left commented out in this class.)
     */
    //using Vector = Vector<T, Abi>;

    /// \name Generators
    ///@{
    /**
     * Creates a new mask object initialized to zero/\c false.
     *
     * \returns A mask object with zero-initialized components.
     */
    Vc_INTRINSIC static Mask Zero();

    /**
     * Creates a mask object initialized to one/\c true.
     *
     * \returns A mask object with components initialized to \c true.
     */
    Vc_INTRINSIC static Mask One();

    /// Generate a mask object from booleans returned from the function \p gen.
    template <typename G> static Vc_INTRINSIC Mask generate(G &&gen);
    ///@}

    /// \name Compile-Time Constant Initialization
    ///@{
    /**
     * Construct a zero-initialized mask object.
     *
     * This constructor follows the behavior of the underlying \c bool type in that the
     * expression `bool()` zero-initializes the object (to \c false). On the other hand
     * the variable \c x in `bool x;` is uninitialized.
     * Since, for class types, both expressions call the default constructor `Mask<T> x`
     * must zero-initialize \c x as well.
     */
    Vc_INTRINSIC Mask() = default;

    /// Zero-initialize the new mask object (\c false).
    /// \see Vc::Zero, Zero()
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero);

    /// Initialize the new mask object to one (\c true).
    /// \see Vc::One, One()
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne);
    ///@}

    /// \name Conversion/Broadcast Constructors
    ///@{
    /**
     * Broadcast constructor.
     *
     * Set all components of the new mask object to \p b.
     *
     * \param b Determines the initial state of the mask.
     */
    Vc_INTRINSIC explicit Mask(bool b);

    /**
     * Implicit conversion from a compatible (equal \VSize{T} on every platform) mask
     * object.
     *
     * \param otherMask The mask to be converted.
     */
    template <typename U>
    Vc_INTRINSIC Mask(U &&otherMask,
                      Common::enable_if_mask_converts_implicitly<T, U> = nullarg);

#if Vc_IS_VERSION_1
    /**
     * Explicit conversion (static_cast) from a mask object that potentially has a
     * different \VSize{T}.
     *
     * \param otherMask The mask to be converted.
     *
     * \deprecated Use simd_cast instead of explicit type casting to convert between
     * mask types.
     *
     * \internal This is implemented via simd_cast in scalar/simd_cast_caller.h
     */
    template <typename U>
    Vc_DEPRECATED(
        "use simd_cast instead of explicit type casting to convert between mask types")
        Vc_INTRINSIC_L
        explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly<T, U> =
                                         nullarg) Vc_INTRINSIC_R;
#endif
    // The group is closed outside of the conditional block so that the Doxygen
    // grouping stays balanced regardless of the value of Vc_IS_VERSION_1.
    ///@}

    /**
     * \name Loads & Stores
     */
    ///@{
    /**
     * Load constructor from an array of \c bool.
     *
     * This constructor implements an explicit conversion from an array of booleans to a
     * mask object. It corresponds to a Vector load constructor.
     *
     * \param mem A pointer to the start of the array of booleans.
     * \see Mask(const bool *, Flags), load(const bool *)
     */
    Vc_ALWAYS_INLINE explicit Mask(const bool *mem);
    /**
     * Overload of the above with a load/store flag argument.
     *
     * \param mem A pointer to the start of the array of booleans.
     * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
     * Vc::Unaligned, Vc::PrefetchDefault, ...
     * \see load(const bool *, Flags)
     */
    template <typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags);

    /**
     * Load the components of the mask from an array of \c bool.
     *
     * \param mem A pointer to the start of the array of booleans.
     * \see load(const bool *, Flags), Mask(const bool *)
     */
    Vc_ALWAYS_INLINE void load(const bool *mem);
    /**
     * Overload of the above with a load/store flag argument.
     *
     * \param mem A pointer to the start of the array of booleans.
     * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
     * Vc::Unaligned, Vc::PrefetchDefault, ...
     * \see Mask(const bool *, Flags)
     */
    template <typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags);

    /**
     * Store the values of the mask to an array of \c bool.
     *
     * \param mem A pointer to the start of the array of booleans.
     * \see store(bool *, Flags)
     */
    Vc_ALWAYS_INLINE void store(bool *mem) const;
    /**
     * Overload of the above with a load/store flag argument.
     *
     * \param mem A pointer to the start of the array of booleans.
     * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
     * Vc::Unaligned, Vc::PrefetchDefault, ...
     */
    template <typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const;
    ///@}

    /// \name Comparison Operators
    ///@{
    /**
     * Returns whether the two masks are equal in all components.
     *
     * \param mask The other mask to compare against.
     * \returns A scalar boolean value that says whether all components of the two masks
     * are equal.
     *
     * \note If you expected a behavior similar to the compare operator of Vc::Vector,
     * consider that the bitwise operators already implement such functionality. There is
     * little use, typically, in having `a == b` return the same as `a ^ b`. In general,
     * it is more useful to query `none_of(a ^ b)` (no component differs) which is the
     * same as this equality operator.
     */
    Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const;

    /**
     * Returns whether the two masks are different in at least one component.
     *
     * \param mask The other mask to compare against.
     * \returns A scalar boolean value that says whether at least one component of the two masks is different.
     *
     * \note `(a == b) == !(a != b)` holds
     * \see Mask::operator==(const Mask &)
     */
    Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const;
    ///@}

    /**
     * \name Logical and Binary Operators
     *
     * \brief Component-wise logical/binary operations on mask objects.
     *
     * The effect of logical and binary \c AND and \c OR is equivalent for mask types (as
     * it is for \c bool).
     */
    ///@{

    /// Returns the component-wise application of a logical \c AND to \p mask.
    Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const;
    /// Returns the component-wise application of a binary \c AND to \p mask.
    Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const;
    /// Returns the component-wise application of a logical \c OR to \p mask.
    Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const;
    /// Returns the component-wise application of a binary \c OR to \p mask.
    Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const;
    /// Returns the component-wise application of a binary \c XOR to \p mask.
    Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const;
    /// Returns a mask with inverted components.
    Vc_ALWAYS_INLINE Mask operator!() const;

    /// Modifies the mask using an \c AND operation with \p mask.
    Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask);
    /// Modifies the mask using an \c OR operation with \p mask.
    Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask);
    /// Modifies the mask using an \c XOR operation with \p mask.
    Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask);
    ///@}

    /**
     * \name Reductions
     *
     * \see any_of, all_of, none_of, some_of
     */
    ///@{

    /// Returns a logical \c AND of all components.
    Vc_ALWAYS_INLINE bool isFull() const;
    /// Returns a logical \c OR of all components.
    Vc_ALWAYS_INLINE bool isNotEmpty() const;
    /// Returns \c true if all components are \c false, \c false otherwise.
    Vc_ALWAYS_INLINE bool isEmpty() const;
    /// Returns `!isFull() && !isEmpty()`.
    Vc_ALWAYS_INLINE bool isMix() const;
    ///@}

    /**\internal
     * \name Internal Data Access
     *
     * NOTE(review): all three accessors are declared with return type \c bool in this
     * class; how the three variants differ is not visible from these declarations.
     */
    ///@{
    Vc_ALWAYS_INLINE bool data() const;
    Vc_ALWAYS_INLINE bool dataI() const;
    Vc_ALWAYS_INLINE bool dataD() const;
    ///@}

    /// \name Scalar Subscript Operators
    ///@{
    /**
     * Lvalue-reference-like access to mask entries.
     *
     * \param index Determines the boolean to be accessed.
     * \return a temporary proxy object referencing the \p index th entry of the mask.
     *
     * \warning This operator does not return an lvalue reference (to \c bool), but rather
     * a temporary (rvalue) object that mimics an lvalue reference (as much as is possible
     * with C++11/14).
     */
    Vc_ALWAYS_INLINE EntryReference operator[](size_t index);

    /**
     * Read-only access to mask entries.
     *
     * \param index Determines the boolean to be accessed.
     * \return The \p index th entry of the mask as a \c bool (rvalue).
     *
     * \warning This operator does not return an lvalue reference (to `const bool`), but
     * rather a temporary (rvalue) \c bool.
     */
    Vc_ALWAYS_INLINE EntryType operator[](size_t index) const;
    ///@}

    /// Returns how many components of the mask are \c true.
    Vc_ALWAYS_INLINE int count() const;

    /**
     * Returns the index of the first one in the mask.
     *
     * \returns the index of the first component that is \c true.
     *
     * \warning The return value is undefined if the mask is empty.
     *
     * Thus, unless `none_of(mask)`, `mask[mask.firstOne()] == true` holds and `mask[i] ==
     * false` for all `i < mask.firstOne()`.
     */
    Vc_ALWAYS_INLINE int firstOne() const;

    /**
     * Convert the boolean components of the mask into bits of an integer.
     *
     * \return An \c int where each bit corresponds to the boolean value in the mask.
     *
     * For example, the mask `[true, false, false, true]` results in a `9` (in binary: `1001`).
     */
    Vc_ALWAYS_INLINE int toInt() const;

    /// Returns a mask with components shifted by \p amount places.
    Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const;

    // NOTE(review): presumably provides allocation functions honoring alignof(Mask);
    // confirm against the definition of Vc_FREE_STORE_OPERATORS_ALIGNED.
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));

private:
    // The implementation-specific storage of the mask (see VectorType).
    VectorType d;
};

}  // namespace Vc

#endif  // VC_COMMON_MASK_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_MEMORYFWD_H_
#define VC_COMMON_MEMORYFWD_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
// Forward declaration of the Memory class template. The default template arguments
// (Size1 = 0, Size2 = 0, InitPadding = true) are fixed here, so the definition must not
// repeat them.
template <typename V, std::size_t Size1 = 0, std::size_t Size2 = 0,
          bool InitPadding = true>
class Memory;

// Forward declaration of the MemoryBase class template.
template <typename V, typename Parent, int Dimension, typename RowMemory>
class MemoryBase;
}  // namespace Common

// Make Common::Memory available directly in the enclosing (versioned) namespace.
using Common::Memory;
}  // namespace Vc

#endif // VC_COMMON_MEMORYFWD_H_

#endif // VC_COMMON_TYPES_H_

// vim: foldmethod=marker

#ifndef VC_SCALAR_TYPES_H_
#define VC_SCALAR_TYPES_H_

// If Vc_DEFAULT_IMPL_Scalar is defined (presumably: the scalar implementation is the
// default one - confirm where the macro is set), define the per-type Vc_*_V_SIZE macros
// to 1.
#ifdef Vc_DEFAULT_IMPL_Scalar
#define Vc_DOUBLE_V_SIZE 1
#define Vc_FLOAT_V_SIZE 1
#define Vc_INT_V_SIZE 1
#define Vc_UINT_V_SIZE 1
#define Vc_SHORT_V_SIZE 1
#define Vc_USHORT_V_SIZE 1
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace Scalar
{
/// Vc::Vector with the ABI tag fixed to VectorAbi::Scalar.
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Scalar>;
using double_v = Vector<double>;
using float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<unsigned int>;
using short_v = Vector<short>;
using ushort_v = Vector<unsigned short>;

/// Vc::Mask with the ABI tag fixed to VectorAbi::Scalar.
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Scalar>;
using double_m = Mask<double>;
using float_m = Mask<float>;
using int_m = Mask<int>;
using uint_m = Mask<unsigned int>;
using short_m = Mask<short>;
using ushort_m = Mask<unsigned short>;

/// Type trait: derives from std::true_type only for Scalar::Vector<T>.
template <typename T> struct is_vector : std::false_type {
};
template <typename T> struct is_vector<Vector<T>> : std::true_type {
};

/// Type trait: derives from std::true_type only for Scalar::Mask<T>.
template <typename T> struct is_mask : std::false_type {
};
template <typename T> struct is_mask<Mask<T>> : std::true_type {
};
}  // namespace Scalar

namespace Traits
{
/// Registers Scalar::Mask<T> with the is_simd_mask_internal trait.
template <typename T>
struct is_simd_mask_internal<Scalar::Mask<T>> : std::true_type {
};

/// Registers Scalar::Vector<T> with the is_simd_vector_internal trait.
template <typename T>
struct is_simd_vector_internal<Scalar::Vector<T>> : std::true_type {
};
}  // namespace Traits
}  // namespace Vc

#endif // VC_SCALAR_TYPES_H_
/*  This file is part of the Vc library. {{{
Copyright © 2016 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SCALAR_DETAIL_H_
#define VC_SCALAR_DETAIL_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/


#ifndef VC_SCALAR_MACROS_H_
#define VC_SCALAR_MACROS_H_

#endif // VC_SCALAR_MACROS_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
//InterleaveImpl{{{1
template<typename V, int Size, size_t VSize> struct InterleaveImpl;

/**\internal
 * Specialization selected when the second template argument is 1.
 *
 * Every vector argument contributes exactly one value, accessed through its \c data()
 * member. Only the first index, `i[0]`, is consulted: the k-th vector argument maps to
 * `data[i[0] + k]`.
 *
 * - \c interleave stores the values of 2 to 8 vectors to consecutive entries of \p data.
 * - \c deinterleave loads 2 to 8 consecutive entries of \p data into the vectors.
 *
 * Overloads for more than four vectors handle the first four and then delegate the
 * remaining ones with the pointer advanced by four entries.
 */
template<typename V, size_t VSize> struct InterleaveImpl<V, 1, VSize> {
private:
    using Entry = typename V::EntryType;  // element type of the memory that is accessed
    using Arg = typename V::AsArg;        // type used to pass vectors as input

public:
    template <typename I>  // interleave 2 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1)
    {
        data[i[0] + 0] = v0.data();
        data[i[0] + 1] = v1.data();
    }
    template <typename I>  // interleave 3 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1, const Arg v2)
    {
        // first two entries via the 2-argument overload, then append the third
        interleave(data, i, v0, v1);
        data[i[0] + 2] = v2.data();
    }
    template <typename I>  // interleave 4 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1, const Arg v2, const Arg v3)
    {
        interleave(data, i, v0, v1, v2);
        data[i[0] + 3] = v3.data();
    }
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1, const Arg v2, const Arg v3,
                                  const Arg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        data[i[0] + 4] = v4.data();
    }
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1, const Arg v2, const Arg v3,
                                  const Arg v4, const Arg v5)
    {
        // entries 0-3, then entries 4-5 relative to the advanced pointer
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1, const Arg v2, const Arg v3,
                                  const Arg v4, const Arg v5, const Arg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(Entry *const data, const I &i, const Arg v0,
                                  const Arg v1, const Arg v2, const Arg v3,
                                  const Arg v4, const Arg v5, const Arg v6,
                                  const Arg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    template <typename I>  // deinterleave 2 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1)
    {
        v0.data() = data[i[0] + 0];
        v1.data() = data[i[0] + 1];
    }
    template <typename I>  // deinterleave 3 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1,
                                    V &v2)
    {
        // first two entries via the 2-argument overload, then read the third
        deinterleave(data, i, v0, v1);
        v2.data() = data[i[0] + 2];
    }
    template <typename I>  // deinterleave 4 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1,
                                    V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1, v2);
        v3.data() = data[i[0] + 3];
    }
    template <typename I>  // deinterleave 5 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1,
                                    V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        v4.data() = data[i[0] + 4];
    }
    template <typename I>  // deinterleave 6 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1,
                                    V &v2, V &v3, V &v4, V &v5)
    {
        // entries 0-3, then entries 4-5 relative to the advanced pointer
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }
    template <typename I>  // deinterleave 7 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1,
                                    V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // deinterleave 8 args{{{2
    static inline void deinterleave(Entry const *const data, const I &i, V &v0, V &v1,
                                    V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6, v7);
    }
};
//}}}1
}  // namespace Detail
}  // namespace Vc

#endif  // VC_SCALAR_DETAIL_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SCALAR_MASK_H_
#define VC_SCALAR_MASK_H_


namespace Vc_VERSIONED_NAMESPACE
{
template <typename T> class Mask<T, VectorAbi::Scalar>
{
    friend class Mask<  double, VectorAbi::Scalar>;
    friend class Mask<   float, VectorAbi::Scalar>;
    friend class Mask< int32_t, VectorAbi::Scalar>;
    friend class Mask<uint32_t, VectorAbi::Scalar>;
    friend class Mask< int16_t, VectorAbi::Scalar>;
    friend class Mask<uint16_t, VectorAbi::Scalar>;

public:
    using abi = VectorAbi::Scalar;

    static constexpr size_t Size = 1;
    static constexpr size_t MemoryAlignment = 1;
    static constexpr std::size_t size() { return 1; }

    /**
     * The \c EntryType of masks is always bool, independent of \c T.
     */
    typedef bool EntryType;
    using value_type = EntryType;

    using EntryReference = Vc::Detail::ElementReference<Mask>;
    using reference = EntryReference;

    /**
     * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
     * implementation. This type is useful for the \c sizeof operator in generic functions.
     */
    typedef bool VectorEntryType;

    /**
     * The \c VectorType reveals the implementation-specific internal type used for the SIMD type.
     */
    using VectorType = bool;

    /**
     * The associated Vector<T> type.
     */
    using Vector = Scalar::Vector<T>;

    Vc_INTRINSIC Mask() = default;
    Vc_INTRINSIC explicit Mask(bool b) : m(b) {}
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : m(false) {}
    Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : m(true) {}
    Vc_INTRINSIC static Mask Zero() { return Mask(false); }
    Vc_INTRINSIC static Mask One() { return Mask(true); }

    // implicit cast
    template <typename U>
    Vc_INTRINSIC Mask(U &&rhs, Common::enable_if_mask_converts_implicitly<T, U> = nullarg)
        : m(rhs.m) {}

#if Vc_IS_VERSION_1
    // explicit cast, implemented via simd_cast (in scalar/simd_cast_caller.h)
    template <typename U>
    Vc_DEPRECATED(
        "use simd_cast instead of explicit type casting to convert between mask types")
        Vc_INTRINSIC_L
        explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U> = nullarg)
            Vc_INTRINSIC_R;
#endif

        Vc_ALWAYS_INLINE explicit Mask(const bool *mem) : m(mem[0]) {}
        template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags) : m(mem[0]) {}

        Vc_ALWAYS_INLINE void load(const bool *mem) { m = mem[0]; }
        template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { m = mem[0]; }

        Vc_ALWAYS_INLINE void store(bool *mem) const { *mem = m; }
        template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { *mem = m; }

        Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return m == rhs.m; }
        Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return m != rhs.m; }

        Vc_ALWAYS_INLINE Mask operator&&(const Mask &rhs) const { return Mask(m && rhs.m); }
        Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); }
        Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); }
        Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); }
        Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^  rhs.m); }
        Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); }

        Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; }
        Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; }
        Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; }

        Vc_ALWAYS_INLINE bool isFull () const { return  m; }
        Vc_ALWAYS_INLINE bool isNotEmpty() const { return m; }
        Vc_ALWAYS_INLINE bool isEmpty() const { return !m; }
        Vc_ALWAYS_INLINE bool isMix  () const { return false; }

        Vc_ALWAYS_INLINE bool data () const { return m; }
        Vc_ALWAYS_INLINE bool dataI() const { return m; }
        Vc_ALWAYS_INLINE bool dataD() const { return m; }

private:
    friend reference;
    static Vc_INTRINSIC bool get(const Mask &o, int) noexcept { return o.m; }
    template <typename U>
    static Vc_INTRINSIC void set(Mask &o, int, U &&v) noexcept(
        noexcept(std::declval<bool &>() = std::declval<U>()))
    {
        o.m = std::forward<U>(v);
    }

public:
    /**
     * \note the returned object models the concept of a reference and
     * as such it can exist longer than the data it is referencing.
     * \note to avoid lifetime issues, we strongly advice not to store
     * any reference objects.
     */
    Vc_ALWAYS_INLINE reference operator[](size_t i) noexcept
    {
        Vc_ASSERT(i == 0); if (i) {}
        return {*this, 0};
    }
    Vc_ALWAYS_INLINE value_type operator[](size_t i) const noexcept
    {
        Vc_ASSERT(i == 0); if (i) {}
        return m;
    }

        Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; }

        /**
         * Returns the index of the first one in the mask.
         *
         * The return value is undefined if the mask is empty.
         */
        Vc_ALWAYS_INLINE int firstOne() const { return 0; }
        Vc_ALWAYS_INLINE int toInt() const { return m ? 1 : 0; }

        template <typename G> static Vc_INTRINSIC Mask generate(G &&gen)
        {
            return Mask(gen(0));
        }

        Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const
        {
            if (amount == 0) {
                return *this;
            } else {
                return Zero();
            }
        }

    private:
        bool m;
};
// Namespace-scope definitions of the constexpr static data members of the scalar Mask.
// Before C++17 such a definition is needed whenever the member is odr-used (e.g. bound
// to a const reference).
template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::MemoryAlignment;

}  // namespace Vc

#endif // VC_SCALAR_MASK_H_


namespace Vc_VERSIONED_NAMESPACE
{
#define Vc_CURRENT_CLASS_NAME Vector
/**
 * Specialization of Vector for the scalar ABI.
 *
 * The "vector" stores exactly one entry of type \p T (\c Size is 1). Consequently the
 * reductions, shifts, rotations, and permutations below collapse to trivial operations
 * on the single stored value \c m_data.
 */
template <typename T> class Vector<T, VectorAbi::Scalar>
{
    static_assert(std::is_arithmetic<T>::value,
                  "Vector<T> only accepts arithmetic builtin types as template parameter T.");

    public:
        using abi = VectorAbi::Scalar;
        // With a single entry, the entry type, the storage type, and value_type are all
        // aliases of the same type.
        using EntryType = typename Common::ensure_alignment_equals_sizeof<T>::type;
        using VectorEntryType = EntryType;
        using value_type = EntryType;
        using VectorType = EntryType;
        using vector_type = VectorType;
        // Proxy type returned by the non-const operator[].
        using reference = Detail::ElementReference<Vector>;

    protected:
        // The one stored entry; value-initialized by default.
        VectorType m_data = VectorType();
        // Shorthand for a vector of entry type U with the same (scalar) ABI.
        template <typename U> using V = Vector<U, abi>;

    public:
        typedef Scalar::Mask<T> Mask;
        using MaskType = Mask;
        using mask_type = Mask;
        typedef Mask MaskArgument;
        typedef Vector AsArg;

        // Direct access to the stored entry.
        Vc_ALWAYS_INLINE VectorType &data() { return m_data; }
        Vc_ALWAYS_INLINE const VectorType &data() const { return m_data; }

        // Number of entries and the alignment of the stored entry.
        static constexpr size_t Size = 1;
        static constexpr size_t MemoryAlignment = alignof(VectorType);
        typedef SimdArray<int, Size, Scalar::int_v, 1> IndexType;

/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

public:
    ///////////////////////////////////////////////////////////////////////////
    // init to zero
    // (the default member initializer of m_data value-initializes the entry)
    Vector() = default;

    ///////////////////////////////////////////////////////////////////////////
    // types

    ///////////////////////////////////////////////////////////////////////////
    // constants
    static constexpr std::size_t size() { return Size; }

    ///////////////////////////////////////////////////////////////////////////
    // constant Vectors
    // The three tag constructors are only declared here; their definitions are not part
    // of this class body.
    explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
    explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
    explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
    // Named factories that forward to the tag constructors above.
    static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
    static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
    static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
    {
        return Vector(Vc::IndexesFromZero);
    }

// vim: foldmethod=marker

        // Declared only; defined outside of this class body.
        static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;

        // implicit conversion from compatible Vector<U, abi>
        // (participates only if Traits::is_implicit_cast_allowed<U, T> holds)
        template <typename U>
        Vc_INTRINSIC Vector(
            V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
                                            void *>::type = nullptr)
            : m_data(static_cast<EntryType>(x.data()))
        {
        }

#if Vc_IS_VERSION_1
        // static_cast from the remaining Vector<U, abi>
        // (explicit and deprecated; covers every U the implicit constructor rejects)
        template <typename U>
        Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                      "vector types") Vc_INTRINSIC
            explicit Vector(
                V<U> x,
                typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
                                        void *>::type = nullptr)
            : m_data(static_cast<EntryType>(x.data()))
        {
        }
#endif

        ///////////////////////////////////////////////////////////////////////////////////////////
        // broadcast
        Vc_INTRINSIC Vector(EntryType a) : m_data(a) {}
        // Additional broadcast from exactly `int` for vectors whose EntryType is not int;
        // the value is static_cast to EntryType.
        // NOTE(review): presumably this lets plain int literals initialize non-int
        // vectors — confirm intent.
        template <typename U>
        Vc_INTRINSIC Vector(U a,
                            typename std::enable_if<std::is_same<U, int>::value &&
                                                        !std::is_same<U, EntryType>::value,
                                                    void *>::type = nullptr)
            : Vector(static_cast<EntryType>(a))
        {
        }


        ///////////////////////////////////////////////////////////////////////////////////////////
        // zeroing
        // setZero(k) zeroes the entry if k is set; setZeroInverted(k) zeroes it if k is not set.
        Vc_ALWAYS_INLINE void setZero() { m_data = 0; }
        Vc_ALWAYS_INLINE void setZero(Mask k) { if (k.data()) m_data = 0; }
        Vc_ALWAYS_INLINE void setZeroInverted(Mask k) { if (!k.data()) m_data = 0; }

        // Declared only; defined outside of this class body.
        Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R;


        //prefix
        Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; }
        Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; }
        //postfix
        // (the previous value is returned through the implicit broadcast constructor)
        Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; }
        Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; }

    private:
        // Element access hooks, made accessible to the proxy type via friendship.
        friend reference;
        // Reads the stored entry; only index 0 is valid (checked via Vc_ASSERT).
        // NOTE(review): the empty `if (i) {}` presumably keeps `i` referenced when
        // Vc_ASSERT expands to nothing.
        Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
        {
            Vc_ASSERT(i == 0); if (i) {}
            return o.m_data;
        }
        // Writes the stored entry; only index 0 is valid (checked via Vc_ASSERT).
        // noexcept exactly when assigning `v` (as an lvalue) to a value_type lvalue is.
        // NOTE(review): `v` is assigned as an lvalue here, not std::forward'ed.
        template <typename U>
        Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
            noexcept(std::declval<value_type &>() = v))
        {
            Vc_ASSERT(i == 0); if (i) {}
            o.m_data = v;
        }

    public:
        /**
         * Writable element access; returns a proxy bound to this vector and \p index.
         * The static_assert guarantees that constructing the proxy cannot throw, which
         * backs the \c noexcept on this operator.
         *
         * \note the returned object models the concept of a reference and
         * as such it can exist longer than the data it is referencing.
         * \note to avoid lifetime issues, we strongly advise not to store
         * any reference objects.
         */
        Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
        {
            static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
            return {*this, int(index)};
        }
        // Read-only element access; only index 0 is valid (checked via Vc_ASSERT).
        Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
        {
            Vc_ASSERT(index == 0); if (index) {}
            return m_data;
        }

        // Logical negation: the resulting mask is set when the entry compares equal to zero.
        Vc_ALWAYS_INLINE Mask operator!() const
        {
            return Mask(!m_data);
        }
        // Bitwise complement; rejected at compile time for non-integral T unless
        // Vc_ENABLE_FLOAT_BIT_OPERATORS is defined.
        Vc_ALWAYS_INLINE Vector operator~() const
        {
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
            static_assert(std::is_integral<T>::value, "bit-complement can only be used with Vectors of integral type");
#endif
            return Vector(~m_data);
        }

        // Unary minus and unary plus.
        Vc_ALWAYS_INLINE Vector operator-() const
        {
            return -m_data;
        }
        Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; }

// Generates one binary member operator per symbol supplied by Vc_ALL_SHIFTS (defined
// elsewhere; presumably << and >>), applying the builtin operator to the two entries.
#define Vc_OP(symbol) \
        Vc_ALWAYS_INLINE Vc_PURE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); }
        Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP

        Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
            isNegative() const
        {
            return Vc::isnegative(*this);
        }

        // Masked assignment: copies the entry of v only if m is set.
        Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) {
          if (m.data()) m_data = v.m_data;
        }

        // Deprecated value conversion: static_cast of the entry to V2's entry type.
        template <typename V2>
        Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
            staticCast() const
        {
            return V2(static_cast<typename V2::EntryType>(m_data));
        }
        // Deprecated bit reinterpretation: reads the storage of m_data through a
        // Vc_MAY_ALIAS-qualified pointer to V2's entry type.
        // NOTE(review): this reads sizeof(V2::EntryType) bytes starting at &m_data; if
        // that exceeds sizeof(EntryType) the read goes past the stored entry — confirm
        // that callers only use equally sized (or smaller) entry types.
        template <typename V2>
        Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
            reinterpretCast() const
        {
            typedef typename V2::EntryType AliasT2 Vc_MAY_ALIAS;
            return V2(*reinterpret_cast<const AliasT2 *>(&m_data));
        }

        // Write-masking: returns a proxy holding this vector together with the mask m.
        Vc_ALWAYS_INLINE Common::WriteMaskedVector<Vector, Mask> operator()(Mask m)
        {
            return {*this, m};
        }

        // Reductions over a single entry all yield that entry.
        Vc_ALWAYS_INLINE EntryType min() const { return m_data; }
        Vc_ALWAYS_INLINE EntryType max() const { return m_data; }
        Vc_ALWAYS_INLINE EntryType product() const { return m_data; }
        Vc_ALWAYS_INLINE EntryType sum() const { return m_data; }
        Vc_ALWAYS_INLINE Vector partialSum() const { return *this; }
        // Masked reductions. product/sum return the neutral element (1 / 0) when the mask
        // is not set.
        // NOTE(review): min(Mask)/max(Mask) ignore the mask and return the entry even if
        // the mask is not set — confirm that the result for an empty mask is meant to be
        // unspecified.
        Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; }
        Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; }
        Vc_ALWAYS_INLINE EntryType product(Mask m) const
        {
            if (m.data()) {
                return m_data;
            } else {
                return EntryType(1);
            }
        }
        Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m.data()) return m_data; return static_cast<EntryType>(0); }

        // Shift with shift-in: amount must be in [-1, 1] (checked via Vc_ASSERT). A shift
        // of 0 keeps the entry; otherwise the result is shiftIn.
        Vc_INTRINSIC Vector Vc_VDECL shifted(int amount, Vector shiftIn) const {
            Vc_ASSERT(amount >= -1 && amount <= 1);
            return amount == 0 ? *this : shiftIn;
        }
        // Shift without shift-in: any non-zero amount yields Zero().
        Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? *this : Zero(); }
        // Rotating, reversing, or sorting a single entry returns the vector unchanged.
        Vc_INTRINSIC Vector rotated(int) const { return *this; }
        Vc_INTRINSIC Vector reversed() const { return *this; }
        Vc_INTRINSIC Vector sorted() const { return *this; }

        // Invokes f once with the entry.
        template <typename F> void callWithValuesSorted(F &&f) { f(m_data); }

        // Invokes f with the entry; the masked overload does so only if the mask is set.
        template <typename F> Vc_INTRINSIC void call(F &&f) const { f(m_data); }

        template <typename F> Vc_INTRINSIC void call(F &&f, Mask mask) const
        {
            if (mask.data()) {
                f(m_data);
            }
        }

        // Returns a vector built from f(entry); the masked overload returns *this
        // unchanged if the mask is not set.
        template <typename F> Vc_INTRINSIC Vector apply(F &&f) const { return Vector(f(m_data)); }

        template <typename F> Vc_INTRINSIC Vector apply(F &&f, Mask mask) const
        {
            if (mask.data()) {
                return Vector(f(m_data));
            } else {
                return *this;
            }
        }

        // Overwrites the entry with f(0), or with f() for the index-less overload.
        template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
            m_data = f(0);
        }
        Vc_INTRINSIC void fill(EntryType (&f)()) {
            m_data = f();
        }

        // Builds a vector from the generator's result for index 0.
        template <typename G> static Vc_INTRINSIC Vector generate(G gen)
        {
            return gen(0);
        }

        Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector Vc_VDECL
            copySign(Vector reference) const
        {
            return Vc::copysign(*this, reference);
        }

        Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
        {
            return Vc::exponent(*this);
        }

        // interleaveLow returns *this; interleaveHigh returns its argument.
        Vc_INTRINSIC Vector Vc_VDECL interleaveLow(Vector) const { return *this; }
        Vc_INTRINSIC Vector Vc_VDECL interleaveHigh(Vector x) const { return x; }
};
#undef Vc_CURRENT_CLASS_NAME
// Namespace-scope definitions of the constexpr static data members of the scalar Vector.
// Before C++17 such a definition is needed whenever the member is odr-used (e.g. bound
// to a const reference).
template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::MemoryAlignment;

// Compound shift-assignment operators for scalar-ABI vectors, one per symbol supplied by
// Vc_ALL_SHIFTS (defined elsewhere).
//
// For every symbol the macro emits a free function template `operator symbol=` that
//  - is removed via SFINAE (defaulted third template parameter) unless the builtin
//    compound operator is valid for two operands of type T,
//  - participates only if U is convertible to Vector<T, VectorAbi::Scalar>,
//  - converts rhs to such a vector, applies the builtin compound operator to the stored
//    entries, and returns lhs by reference.
#define Vc_OP(symbol)                                                                    \
    template <typename T, typename U,                                                    \
              typename = decltype(std::declval<T &>() symbol## = std::declval<T>())>     \
    Vc_INTRINSIC enable_if<std::is_convertible<U, Vector<T, VectorAbi::Scalar>>::value,  \
                           Vector<T, VectorAbi::Scalar>>                                 \
        &operator symbol##=(Vector<T, VectorAbi::Scalar> &lhs, U &&rhs)                  \
    {                                                                                    \
        lhs.data() symbol## = Vector<T, VectorAbi::Scalar>(std::forward<U>(rhs)).data(); \
        return lhs;                                                                      \
    }
Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP

// conditional_assign overloads for the (compound) assignment operators on scalar-ABI
// vectors.
//
// Each expansion defines one function template that is selected (via enable_if) when the
// explicit template argument O equals Operator::name_. It applies `lhs op_ rhs` only if
// mask.isFull() is true and returns nothing.
#define Vc_CONDITIONAL_ASSIGN(name_, op_)                                                \
    template <Operator O, typename T, typename M, typename U>                            \
    Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign(               \
        Vector<T, VectorAbi::Scalar> &lhs, M &&mask, U &&rhs)                            \
    {                                                                                    \
        if (mask.isFull()) {                                                             \
            lhs op_ std::forward<U>(rhs);                                                \
        }                                                                                \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(          Assign,  =);
Vc_CONDITIONAL_ASSIGN(      PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN(     MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN(  MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN(    DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN(       XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN(       AndAssign, &=);
Vc_CONDITIONAL_ASSIGN(        OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN

// conditional_assign overloads for increment and decrement on scalar-ABI vectors.
//
// Each expansion defines one function template that is selected (via enable_if) when the
// explicit template argument O equals Operator::name_. If mask.isFull() is true, expr_
// is evaluated (modifying lhs) and its value is returned; otherwise lhs is left
// untouched and a copy of it is returned.
#define Vc_CONDITIONAL_ASSIGN(name_, expr_)                                              \
    template <Operator O, typename T, typename M>                                        \
    Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Scalar>>           \
    conditional_assign(Vector<T, VectorAbi::Scalar> &lhs, M &&mask)                      \
    {                                                                                    \
        return mask.isFull() ? (expr_) : lhs;                                            \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs);
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs);
#undef Vc_CONDITIONAL_ASSIGN

}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include <cmath>
/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_CONST_DATA_H_
#define VC_COMMON_CONST_DATA_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

// Declarations only (note the `extern`); the objects are defined elsewhere.
// RandomState: array of unknown bound, declared with 64-byte alignment.
// NOTE(review): presumably the state of the library's random number generator — confirm
// at the definition.
alignas(64) extern unsigned int RandomState[];
// AllBitsSet: eight constant unsigned ints, declared with 32-byte alignment.
// NOTE(review): by its name presumably every bit is 1 — verify at the definition.
alignas(32) extern const unsigned int AllBitsSet[8];

}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_CONST_DATA_H_
/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_WHERE_H_
#define VC_COMMON_WHERE_H_


namespace Vc_VERSIONED_NAMESPACE
{

namespace WhereImpl
{

    /** \internal
     * Pairs a mask with the lvalue it guards. Every assignment, compound assignment,
     * increment, and decrement applied to the pair is forwarded to the
     * conditional_assign overload selected by the matching Operator enumerator.
     *
     * This default implementation covers Vc::Mask types and any \p _LValue type that
     * implements an overload for the Vc::conditional_assign function.
     */
    template <typename _Mask, typename _LValue> struct MaskedLValue
    {
        using Mask = _Mask;
        using LValue = _LValue;

        const Mask &mask;
        LValue &lhs;

        // The constructors have to be declared explicitly; otherwise GCC fails to warn
        // for Vc_WARN_UNUSED_RESULT.
        constexpr MaskedLValue(const Mask &mask_, LValue &lvalue_) : mask(mask_), lhs(lvalue_) {}
        MaskedLValue(const MaskedLValue &) = delete;
        constexpr MaskedLValue(MaskedLValue &&) = default;

        /* All operators below intentionally return void: if a bool is used as the mask,
         * the operation might be skipped completely, so there is nothing that could be
         * returned. Returning a value here would be like requiring an if statement to
         * return a value.
         */
        template <typename T> Vc_ALWAYS_INLINE void operator=(T &&rhs)
        {
            conditional_assign<Operator::Assign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator+=(T &&rhs)
        {
            conditional_assign<Operator::PlusAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator-=(T &&rhs)
        {
            conditional_assign<Operator::MinusAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator*=(T &&rhs)
        {
            conditional_assign<Operator::MultiplyAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator/=(T &&rhs)
        {
            conditional_assign<Operator::DivideAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator%=(T &&rhs)
        {
            conditional_assign<Operator::RemainderAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator^=(T &&rhs)
        {
            conditional_assign<Operator::XorAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator&=(T &&rhs)
        {
            conditional_assign<Operator::AndAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator|=(T &&rhs)
        {
            conditional_assign<Operator::OrAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs)
        {
            conditional_assign<Operator::LeftShiftAssign>(lhs, mask, std::forward<T>(rhs));
        }
        template <typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs)
        {
            conditional_assign<Operator::RightShiftAssign>(lhs, mask, std::forward<T>(rhs));
        }

        // Masked increment / decrement (prefix first, then postfix).
        Vc_ALWAYS_INLINE void operator++()
        {
            conditional_assign<Operator::PreIncrement>(lhs, mask);
        }
        Vc_ALWAYS_INLINE void operator++(int)
        {
            conditional_assign<Operator::PostIncrement>(lhs, mask);
        }
        Vc_ALWAYS_INLINE void operator--()
        {
            conditional_assign<Operator::PreDecrement>(lhs, mask);
        }
        Vc_ALWAYS_INLINE void operator--(int)
        {
            conditional_assign<Operator::PostDecrement>(lhs, mask);
        }
    };

    /** \internal
     * Specialization for masked scatters: the left-hand side is a
     * Common::SubscriptOperation (stored by value), and assigning to the pair calls
     * \c scatter on the right-hand side with the subscript's scatter arguments and the
     * mask.
     */
    template <typename _Mask, typename T_, typename I_, typename S_>
    struct MaskedLValue<_Mask, Common::SubscriptOperation<T_, I_, S_, true>>
    {
        using Mask = _Mask;
        using SO = Common::SubscriptOperation<T_, I_, S_, true>;

        const Mask &mask;
        const SO lhs;

        template <typename T> using Decay = typename std::decay<T>::type;

        // The constructors have to be declared explicitly; otherwise GCC fails to warn
        // for Vc_WARN_UNUSED_RESULT.
        constexpr MaskedLValue(const Mask &mask_, SO &&subscript_)
            : mask(mask_), lhs(subscript_)
        {
        }
        MaskedLValue(const MaskedLValue &) = delete;
        constexpr MaskedLValue(MaskedLValue &&) = default;

        /* The assignment operator intentionally returns void: if a bool is used as the
         * mask, the operation might be skipped completely, so there is nothing that
         * could be returned. Returning a value here would be like requiring an if
         * statement to return a value.
         */
        template <typename T> Vc_ALWAYS_INLINE void operator=(T &&rhs)
        {
            std::forward<T>(rhs).scatter(lhs.scatterArguments(), mask);
        }
        /*
         * The following operators maybe make some sense. But only if implemented directly on the
         * scalar objects in memory. Thus, the user is probably better off with a manual loop.
         *
         * If implemented the operators would need to do a masked gather, one operation, and a
         * masked scatter. There is no way this is going to be efficient.
         *
        template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  + std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  - std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  * std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  / std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  % std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  ^ std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  & std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask)  | std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask) << std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { (Decay<T>(lhs.gatherArguments(), mask) >> std::forward<T>(rhs)).scatter(lhs.scatterArguments(), mask); }
        Vc_ALWAYS_INLINE void operator++()    { ++lhs(mask); }
        Vc_ALWAYS_INLINE void operator++(int) { lhs(mask)++; }
        Vc_ALWAYS_INLINE void operator--()    { --lhs(mask); }
        Vc_ALWAYS_INLINE void operator--(int) { lhs(mask)--; }
        */
    };

    /** \internal
     * Specialization for a plain \c bool mask: every operator reduces to an ordinary
     * \c if around the corresponding builtin/overloaded operator of the lvalue. If the
     * mask is \c false, the right-hand side is not used and the lvalue is left untouched.
     */
    template <typename _LValue> struct MaskedLValue<bool, _LValue>
    {
        using Mask = bool;
        using LValue = _LValue;

        const Mask &mask;
        LValue &lhs;

        // The constructors have to be declared explicitly; otherwise GCC fails to warn
        // for Vc_WARN_UNUSED_RESULT.
        constexpr MaskedLValue(const Mask &mask_, LValue &lvalue_) : mask(mask_), lhs(lvalue_) {}
        MaskedLValue(const MaskedLValue &) = delete;
        constexpr MaskedLValue(MaskedLValue &&) = default;

        // Assignment and compound assignment.
        template <typename T> Vc_ALWAYS_INLINE void operator=(T &&rhs)
        {
            if (mask) { lhs = std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator+=(T &&rhs)
        {
            if (mask) { lhs += std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator-=(T &&rhs)
        {
            if (mask) { lhs -= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator*=(T &&rhs)
        {
            if (mask) { lhs *= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator/=(T &&rhs)
        {
            if (mask) { lhs /= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator%=(T &&rhs)
        {
            if (mask) { lhs %= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator^=(T &&rhs)
        {
            if (mask) { lhs ^= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator&=(T &&rhs)
        {
            if (mask) { lhs &= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator|=(T &&rhs)
        {
            if (mask) { lhs |= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs)
        {
            if (mask) { lhs <<= std::forward<T>(rhs); }
        }
        template <typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs)
        {
            if (mask) { lhs >>= std::forward<T>(rhs); }
        }

        // Increment / decrement (prefix first, then postfix).
        Vc_ALWAYS_INLINE void operator++()
        {
            if (mask) { ++lhs; }
        }
        Vc_ALWAYS_INLINE void operator++(int)
        {
            if (mask) { lhs++; }
        }
        Vc_ALWAYS_INLINE void operator--()
        {
            if (mask) { --lhs; }
        }
        Vc_ALWAYS_INLINE void operator--(int)
        {
            if (mask) { lhs--; }
        }
    };

    /** \internal
     * Holds a reference to a mask and binds it to the left operand of a subsequent
     * assignment, either via \c operator| or via the function call operator. The result
     * of the binding is a MaskedLValue.
     *
     * Only a reference to the mask is stored, so the mask object must outlive this
     * object.
     */
    template<typename _Mask> struct WhereMask
    {
        typedef _Mask Mask;
        const Mask &mask;

        // the ctors must be present, otherwise GCC fails to warn for Vc_WARN_UNUSED_RESULT
        constexpr WhereMask(const Mask &m) : mask(m) {}
        WhereMask(const WhereMask &) = delete;

        // Binds the mask to a subscript expression (masked scatter). Scattering into
        // const memory is rejected at compile time.
        template <typename T, typename I, typename S>
        constexpr Vc_WARN_UNUSED_RESULT
            MaskedLValue<Mask, Common::SubscriptOperation<T, I, S, true>>
            operator|(Common::SubscriptOperation<T, I, S, true> &&lhs) const
        {
            static_assert(!std::is_const<T>::value,
                          "masked scatter to constant memory not possible.");
            return {mask, std::move(lhs)};
        }

        // Binds the mask to an arbitrary lvalue. The static_assert rejects rvalues, which
        // typically result from operator precedence surprises in the calling expression.
        template<typename T> constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator|(T &&lhs) const
        {
            static_assert(std::is_lvalue_reference<T>::value, "Syntax error: Incorrect use of Vc::where. Maybe operator precedence got you by surprise. Examples of correct usage:\n"
                    "  Vc::where(x < 2) | x += 1;\n"
                    "  (Vc::where(x < 2) | x)++;\n"
                    "  Vc::where(x < 2)(x) += 1;\n"
                    "  Vc::where(x < 2)(x)++;\n"
                    );
            return { mask, lhs };
        }

        // Function call syntax: where(mask)(x) is equivalent to where(mask) | x.
        template<typename T> constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator()(T &&lhs) const
        {
            return operator|(std::forward<T>(lhs));
        }
    };
}  // namespace WhereImpl

/**
 * \ingroup Utilities
 *
 * Conditional assignment.
 *
 * Since compares between SIMD vectors do not return a single boolean, but rather a vector of
 * booleans (mask), one often cannot use if / else statements. Instead, one needs to state
 * that only a subset of entries of a given SIMD vector should be modified. The \c where function
 * can be prepended to any assignment operation to execute a masked assignment.
 *
 * \param mask The mask that selects the entries in the target vector that will be modified.
 *
 * \return This function returns an opaque object that binds to the left operand of an assignment
 * via the binary-or operator or the functor operator. (i.e. either <code>where(mask) | x = y</code>
 * or <code>where(mask)(x) = y</code>)
 *
 * Example:
 * \code
 * template<typename T> void f1(T &x, T &y)
 * {
 *   if (x < 2) {
 *     x *= y;
 *     y += 2;
 *   }
 * }
 * template<typename T> void f2(T &x, T &y)
 * {
 *   where(x < 2) | x *= y;
 *   where(x < 2) | y += 2;
 * }
 * \endcode
 * The block following the if statement in \c f1 will be executed if <code>x &lt; 2</code> evaluates
 * to \c true. If \c T is a scalar type you normally get what you expect. But if \c T is a SIMD
 * vector type, the comparison will use the implicit conversion from a mask to bool, meaning
 * <code>all_of(x &lt; 2)</code>.
 *
 * Most of the time the required operation is a masked assignment as stated in \c f2.
 *
 */
template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> where(const M &mask)
{
    // Wrap the mask in a WhereMask; the wrapper is list-initialized from the mask and is what
    // the caller subsequently combines with the assignment target.
    return { mask };
}

/**
 * Alternative spelling of where(): builds a WhereImpl::WhereMask<M> from the given mask in
 * exactly the same way.
 *
 * \param m The mask to wrap.
 * \return A WhereImpl::WhereMask<M> list-initialized from \p m.
 */
template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> _if(const M &m)
{
    return { m };
}

}  // namespace Vc

#endif // VC_COMMON_WHERE_H_
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_TRANSPOSE_H_
#define VC_COMMON_TRANSPOSE_H_

#include <tuple>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/**
 * Non-owning bundle of the inputs of a transpose operation.
 *
 * The proxy stores only references (\c const \c Inputs &) to the objects it was constructed
 * from. It therefore must not outlive those objects.
 */
template <typename... Inputs> struct TransposeProxy
{
    /// Binds one reference per input; nothing is copied.
    TransposeProxy(const Inputs &... inputs) : in{inputs...} {}

    /// References to the inputs, in argument order.
    std::tuple<const Inputs &...> in;
};

/// Empty tag type carrying two compile-time lengths, used to select a transpose_impl overload.
template <int LhsLength, size_t RhsLength> struct TransposeTag {
};
}  // namespace Common

/**
 * Creates a proxy object referring to the given inputs.
 *
 * The arguments are taken by const reference so that the references stored in the returned
 * proxy refer to the caller's objects. (Taking them by value would make the proxy refer to
 * this function's parameters, whose lifetime ends with the call.) Temporaries passed as
 * arguments stay valid until the end of the full-expression containing the call.
 *
 * \param vs The objects to be transposed.
 * \return A Common::TransposeProxy holding a const reference to every argument.
 */
template <typename... Vs> Common::TransposeProxy<Vs...> transpose(const Vs &... vs)
{
    return {vs...};
}
}  // namespace Vc

#endif  // VC_COMMON_TRANSPOSE_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SCALAR_OPERATORS_H_
#define VC_SCALAR_OPERATORS_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
// Vc_OP(op_) defines the comparison operator op_ for two Scalar::Vector<T>: it compares the
// values returned by data() and wraps the boolean result in a Scalar::Mask<T>.
// Vc_ALL_COMPARES instantiates the macro once per comparison operator; the helper macro is
// undefined again right afterwards.
#define Vc_OP(op_)                                                                       \
    template <typename T>                                                                \
    Vc_INTRINSIC Scalar::Mask<T> operator op_(Scalar::Vector<T> a, Scalar::Vector<T> b)  \
    {                                                                                    \
        return Scalar::Mask<T>(a.data() op_ b.data());                                   \
    }
Vc_ALL_COMPARES(Vc_OP);
#undef Vc_OP

// bitwise operators {{{1
// Vc_OP(symbol) defines the binary bitwise operator `symbol` for two Scalar::Vector<T>.
// Vc_ALL_BINARY instantiates it once per bitwise operator.
//  - Integral T: applies the builtin operator to the two entries.
//  - Floating-point T: applies the operator to the object representation, viewed through a
//    may-alias unsigned integer type (unsigned int if sizeof(T) == sizeof(int), otherwise
//    unsigned long long).
// Both overloads take their operands by value. The floating-point overload works on its own
// copy of lhs, so the caller's left operand is never modified and rvalue or const operands
// are accepted.
#define Vc_OP(symbol)                                                                    \
    template <typename T>                                                                \
    Vc_INTRINSIC enable_if<std::is_integral<T>::value, Scalar::Vector<T>>                \
    operator symbol(Scalar::Vector<T> a, Scalar::Vector<T> b)                            \
    {                                                                                    \
        return a.data() symbol b.data();                                                 \
    }                                                                                    \
    template <typename T>                                                                \
    Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, Scalar::Vector<T>>          \
    operator symbol(Scalar::Vector<T> lhs, Scalar::Vector<T> rhs)                        \
    {                                                                                    \
        using uinta =                                                                    \
            MayAlias<typename std::conditional<sizeof(T) == sizeof(int), unsigned int,   \
                                               unsigned long long>::type>;               \
        uinta *left = reinterpret_cast<uinta *>(&lhs.data());                            \
        const uinta *right = reinterpret_cast<const uinta *>(&rhs.data());               \
        *left symbol## = *right;                                                         \
        return lhs;                                                                      \
    }
Vc_ALL_BINARY(Vc_OP);
#undef Vc_OP

// arithmetic operators {{{1
// Sum of the two entries; the builtin result is converted to Scalar::Vector<T> on return.
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator+(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
    return a.data() + b.data();
}
// Difference of the two entries (a - b), converted to Scalar::Vector<T> on return.
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator-(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
    return a.data() - b.data();
}
// Product of the two entries, converted to Scalar::Vector<T> on return.
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator*(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
    return a.data() * b.data();
}
// Quotient of the two entries (a / b) using the builtin division of T; no check for a zero
// divisor is performed here.
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator/(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
    return a.data() / b.data();
}
// Remainder of the two entries (a % b) using the builtin operator, which only exists for
// integral operands; no check for a zero divisor is performed here.
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator%(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
    return a.data() % b.data();
}
// }}}1
}  // namespace Detail
}  // namespace Vc

#endif  // VC_SCALAR_OPERATORS_H_

// vim: foldmethod=marker
namespace Vc_VERSIONED_NAMESPACE
{

// special value constructors{{{1
// Tag constructor: initializes the entry with 0.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerZero)
    : m_data(0)
{
}
// Tag constructor: initializes the entry with 1.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerOne)
    : m_data(1)
{
}
// Tag constructor for the "indexes from zero" initializer: the entry is initialized with 0,
// i.e. the first index.
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerIndexesFromZero)
    : m_data(0)
{
}

// load member functions{{{1
// Loads the entry from mem[0], converting from U to the member's type by assignment.
// The Flags argument is unnamed and not used by this implementation.
// The `template` disambiguator in the return type is omitted when Vc_MSVC is defined.
template <typename T>
template <typename U, typename Flags>
Vc_INTRINSIC typename Vector<T, VectorAbi::Scalar>::
#ifndef Vc_MSVC
template
#endif
load_concept<U, Flags>::type Vector<T, VectorAbi::Scalar>::load(const U *mem, Flags)
{
    m_data = mem[0];
}

// store member functions{{{1
// Writes the entry to mem[0]. The Flags argument is unnamed and not used here.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Flags) const
{
    mem[0] = m_data;
}
// Masked store: writes the entry to mem[0] only if the mask is set. The Flags argument is
// unnamed and not used here.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Mask mask, Flags) const
{
    // Nothing to write when the mask entry is unset.
    if (!mask.data()) {
        return;
    }
    mem[0] = m_data;
}

// gather {{{1
// Gather: reads the entry from mem at the position given by the first index (indexes[0]).
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
    const MT *mem, const IT &indexes)
{
    m_data = mem[indexes[0]];
}

// Masked gather: reads the entry from mem[indexes[0]] only if the mask is set; otherwise
// the entry keeps its current value and memory is not accessed.
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
    const MT *mem, const IT &indexes, MaskArgument mask)
{
    if (!mask.data()) {
        return;
    }
    m_data = mem[indexes[0]];
}
// scatter {{{1
// Scatter: writes the entry to mem at the position given by the first index (indexes[0]).
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(MT *mem,
                                                                          IT &&indexes)
    const
{
    mem[indexes[0]] = m_data;
}

// Masked scatter: writes the entry to mem[indexes[0]] only if the mask is set; otherwise
// memory is left untouched.
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(
    MT *mem, IT &&indexes, MaskArgument mask) const
{
    if (!mask.data()) {
        return;
    }
    mem[indexes[0]] = m_data;
}

// exponent {{{1
// Extracts the exponent from the bit pattern of a non-negative float: the bits are shifted
// right by 23 and 0x7f is subtracted. The result is returned as a float value.
// Asserts (via Vc_ASSERT) that the input is >= 0.
Vc_INTRINSIC Vc_CONST Scalar::float_v exponent(Scalar::float_v x)
{
    Vc_ASSERT(x.data() >= 0.f);
    union { float f; int i; } bits;
    bits.f = x.data();
    const int unbiased = (bits.i >> 23) - 0x7f;
    return Scalar::float_v(static_cast<float>(unbiased));
}
// Extracts the exponent from the bit pattern of a non-negative double: the bits are shifted
// right by 52 and 0x3ff is subtracted. The result is returned as a double value.
// Asserts (via Vc_ASSERT) that the input is >= 0.
Vc_INTRINSIC Vc_CONST Scalar::double_v Vc_VDECL exponent(Scalar::double_v x)
{
    Vc_ASSERT(x.data() >= 0.);
    union { double f; long long i; } bits;
    bits.f = x.data();
    const long long unbiased = (bits.i >> 52) - 0x3ff;
    return Scalar::double_v(static_cast<double>(unbiased));
}

// Random {{{1
// Advances the global random state (Common::RandomState) by one step.
// On return, state0 and state1 hold the values loaded from the state *before* the update.
// Both halves are advanced as s * 0xdeece66d + 11; the new first half is additionally
// XOR-ed with the old second half shifted right by 16 bits.
static Vc_ALWAYS_INLINE void _doRandomStep(Scalar::uint_v &state0, Scalar::uint_v &state1)
{
    using Scalar::uint_v;

    // Fetch both halves of the current state.
    state0.load(&Common::RandomState[0]);
    state1.load(&Common::RandomState[uint_v::Size]);

    // Advance and write back the second half.
    const uint_v advanced1 =
        Detail::operator+(Detail::operator*(state1, uint_v(0xdeece66du)), uint_v(11));
    advanced1.store(&Common::RandomState[uint_v::Size]);

    // Advance the first half, mix in the upper bits of the old second half, write back.
    const uint_v advanced0 =
        Detail::operator+(Detail::operator*(state0, uint_v(0xdeece66du)), uint_v(11));
    uint_v(advanced0.data() ^ (state1.data() >> 16)).store(&Common::RandomState[0]);
}

// Generic random number: advances the global state once and converts the first state word
// (as it was before the step) to EntryType.
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Scalar> Vector<T, VectorAbi::Scalar>::Random()
{
    Scalar::uint_v lo;
    Scalar::uint_v hi;
    _doRandomStep(lo, hi);
    return Vector<T, VectorAbi::Scalar>(static_cast<EntryType>(lo.data()));
}
// float specialization: takes bits of the first state word, ORs in the bit pattern of 1.0f
// (0x3f800000) and reinterprets the result as a float, then subtracts 1.
template<> Vc_INTRINSIC Scalar::float_v Scalar::float_v::Random()
{
    Scalar::uint_v lo;
    Scalar::uint_v hi;
    _doRandomStep(lo, hi);
    union { unsigned int i; float f; } bits;
    bits.i = (lo.data() & 0x0fffffffu) | 0x3f800000u;
    return Scalar::float_v(bits.f - 1.f);
}
// double specialization: keeps its own 64-bit state at Common::RandomState[8], accessed
// through a may-alias typedef. The state is advanced as s * 0x5deece66d + 11, truncated to
// the low 52 bits and stored back. Those bits are ORed with the bit pattern of 1.0
// (0x3ff0000000000000), reinterpreted as a double, and 1 is subtracted.
template<> Vc_INTRINSIC Scalar::double_v Scalar::double_v::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 *const slot = reinterpret_cast<uint64 *>(&Common::RandomState[8]);
    const uint64 next = (*slot * 0x5deece66dull + 11) & 0x000fffffffffffffull;
    *slot = next;
    union { unsigned long long i; double f; } bits;
    bits.i = next | 0x3ff0000000000000ull;
    return Scalar::double_v(bits.f - 1.);
}
// isnegative {{{1
// Returns a mask that is set iff the most significant bit (0x80000000) of the float's bit
// pattern is set. Because only that bit is inspected, -0.f also counts as negative.
Vc_INTRINSIC Vc_CONST Scalar::float_m isnegative(Scalar::float_v x)
{
    static_assert(sizeof(float) == sizeof(unsigned int),
                  "This code assumes float and unsigned int have the same number of "
                  "Bytes. Please file a bug report if this is a problem.");
    union { float f; unsigned int i; } bits;
    bits.f = x.data();
    return Scalar::float_m((bits.i & 0x80000000u) != 0u);
}
// Returns a mask that is set iff the most significant bit (0x8000000000000000) of the
// double's bit pattern is set. Because only that bit is inspected, -0. also counts as
// negative.
Vc_INTRINSIC Vc_CONST Scalar::double_m Vc_VDECL isnegative(Scalar::double_v x)
{
    static_assert(sizeof(double) == sizeof(unsigned long long),
                  "This code assumes double and unsigned long long have the same number "
                  "of Bytes. Please file a bug report if this is a problem.");
    union { double d; unsigned long long l; } bits;
    bits.d = x.data();
    return Scalar::double_m((bits.l & 0x8000000000000000ull) != 0ull);
}

// setQnan {{{1
// Sets the entry to the float whose bit pattern is all ones (0xffffffff), a NaN encoding.
// NOTE(review): the generic version always goes through a float/unsigned int union,
// independent of T; presumably it is only meant for float (double has its own
// specialization) — confirm.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan()
{
    union { float f; unsigned int i; } allOnes;
    allOnes.i = 0xffffffffu;
    m_data = allOnes.f;
}
// double specialization: sets the entry to the double whose bit pattern is all ones
// (0xffffffffffffffff), a NaN encoding.
template<> Vc_INTRINSIC void Scalar::double_v::setQnan()
{
    union { double d; unsigned long long l; } allOnes;
    allOnes.l = 0xffffffffffffffffull;
    m_data = allOnes.d;
}
// Masked variant: calls setQnan() only if the mask is set.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan(Mask m)
{
    if (!m.data()) {
        return;
    }
    setQnan();
}
// Masked variant for double: calls setQnan() only if the mask is set.
template<> Vc_INTRINSIC void Scalar::double_v::setQnan(Scalar::double_v::Mask m)
{
    if (!m.data()) {
        return;
    }
    setQnan();
}
// }}}1

namespace Common
{
// transpose_impl {{{1
// 1x1 transpose for scalar float vectors: copies the entry of the single input referenced
// by the proxy into the single output *r[0].
Vc_ALWAYS_INLINE void transpose_impl(TransposeTag<1, 1>, Scalar::float_v *Vc_RESTRICT r[],
                                     const TransposeProxy<Scalar::float_v> &proxy)
{
    const Scalar::float_v &input = std::get<0>(proxy.in);
    *r[0] = input.data();
}
// }}}1
}  // namespace Common
}
// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SCALAR_SIMD_CAST_H_
#define VC_SCALAR_SIMD_CAST_H_

/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_SIMD_CAST_H_
#define VC_COMMON_SIMD_CAST_H_

#include <type_traits>

// declare a bogus simd_cast function template in the global namespace to enable ADL for
// simd_cast<T>
template <class> void simd_cast();

namespace Vc_VERSIONED_NAMESPACE
{
/**
 * Casts the argument \p x from type \p From to type \p To.
 *
 * This function implements the trivial case where \p To and \p From are the same type.
 * The overload only participates when \p To is the same type as the decayed \p From.
 *
 * \param x The object of type \p From to be converted to type \p To.
 * \returns \p x, perfectly forwarded into the returned \p To object; no element-wise
 *          conversion takes place because the types are identical.
 */
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From &&x, enable_if<std::is_same<To, Traits::decay<From>>::value> = nullarg)
{
    return std::forward<From>(x);
}

/**
 * A cast from nothing results in value-initialization of \p To.
 *
 * This function can be useful in generic code where a parameter pack expands to nothing.
 *
 * \returns A value-initialized object of type \p To (i.e. <code>To()</code>).
 */
template <typename To> Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); }

}  // namespace Vc

#endif // VC_COMMON_SIMD_CAST_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SCALAR_TYPE_TRAITS_H_
#define VC_SCALAR_TYPE_TRAITS_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Scalar
{
namespace Traits
{
// is_vector<X> is true only when X is a Scalar Vector<T>.
template <typename T> struct is_vector : std::false_type {};
template <typename T> struct is_vector<Vector<T>> : std::true_type {};

// is_mask<X> is true only when X is a Scalar Mask<T>.
template <typename T> struct is_mask : std::false_type {};
template <typename T> struct is_mask<Mask<T>> : std::true_type {};
}  // namespace Traits
}
}

#endif  // VC_SCALAR_TYPE_TRAITS_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{

// Scalar::Vector to Scalar::Vector
// Enabled when To is a Scalar::Vector: static_casts the source entry to To.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
    simd_cast(Scalar::Vector<From> x, enable_if<Scalar::is_vector<To>::value> = nullarg)
{
    return static_cast<To>(x.data());
}

// Scalar::Mask to Scalar::Mask
// Enabled when To is a Scalar::Mask: static_casts the source mask's data() to To.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
    simd_cast(Scalar::Mask<From> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
{
    return static_cast<To>(x.data());
}

// Any vector (Vector<T> or SimdArray) to multiple Scalar::Vector<T>
// Extracts entry number `offset` of the SIMD vector x into a Scalar::Vector (Return).
// Enabled when T is a SIMD vector type and Return is a Scalar::Vector.
// No range check on `offset` is performed here.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    T &&x,
    enable_if<Traits::is_simd_vector<T>::value && Scalar::is_vector<Return>::value> = nullarg)
{
    return Return(x[offset]);
}

// Scalar::Vector to a non-scalar SIMD vector type, for offset 0 only: starts from a
// value-initialized Return and stores the converted source entry in its entry 0.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<offset == 0 && Traits::is_simd_vector<Return>::value &&
                                    !Scalar::is_vector<Return>::value,
                                Return>
    simd_cast(Scalar::Vector<T> x)
{
    Return result{};
    result[0] = static_cast<typename Return::EntryType>(x.data());
    return result;
}


// Any mask (Mask<T> or SimdMaskArray) to multiple Scalar::Mask<T>
// Extracts entry number `offset` of the SIMD mask x, converted to bool, into a Scalar::Mask
// (Return). Enabled when T is a SIMD mask type and Return is a Scalar::Mask.
// No range check on `offset` is performed here.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    T &&x,
    enable_if<Traits::is_simd_mask<T>::value && Scalar::is_mask<Return>::value> = nullarg)
{
    return Return(bool(x[offset]));
}

// Scalar::Mask to a non-scalar SIMD mask type, for offset 0 only: starts from a Return
// constructed with `false` and copies the source entry into its entry 0.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<
    offset == 0 && Traits::is_simd_mask<Return>::value && !Scalar::is_mask<Return>::value,
    Return>
    simd_cast(Scalar::Mask<T> x)
{
    Return result(false);
    result[0] = x[0];
    return result;
}

}  // namespace Vc

#endif  // VC_SCALAR_SIMD_CAST_H_

// vim: foldmethod=marker

#endif // VC_SCALAR_VECTOR_H_

#ifdef Vc_IMPL_AVX
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_VECTOR_H_
#define VC_AVX_VECTOR_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_INTRINSICS_H_
#define VC_AVX_INTRINSICS_H_

#include <Vc/global.h>

// see comment in sse/intrinsics.h
extern "C" {
// AVX
#include <immintrin.h>

#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
#include <x86intrin.h>
#endif
}

/*{{{
    Copyright (C) 2013-2015 Matthias Kretz <kretz@kde.org>

    Permission to use, copy, modify, and distribute this software
    and its documentation for any purpose and without fee is hereby
    granted, provided that the above copyright notice appear in all
    copies and that both that the copyright notice and this
    permission notice and warranty disclaimer appear in supporting
    documentation, and that the name of the author not be used in
    advertising or publicity pertaining to distribution of the
    software without specific, written prior permission.

    The author disclaim all warranties with regard to this
    software, including all implied warranties of merchantability
    and fitness.  In no event shall the author be liable for any
    special, indirect or consequential damages or any damages
    whatsoever resulting from loss of use, data or profits, whether
    in an action of contract, negligence or other tortious action,
    arising out of or in connection with the use or performance of
    this software.

}}}*/

#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#define VC_COMMON_FIX_CLANG_EMMINTRIN_H_

#include <Vc/global.h>

#if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000)

// If the compiler's header provides _mm_slli_si128 as a macro, replace it with a version
// that calls __builtin_ia32_pslldqi128 directly, passing the count multiplied by 8.
#ifdef _mm_slli_si128
#undef _mm_slli_si128
#define _mm_slli_si128(a, count) __extension__ ({ \
  (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); })
#endif

// If the compiler's header provides _mm_srli_si128 as a macro, replace it with a version
// that calls __builtin_ia32_psrldqi128 directly, passing the count multiplied by 8.
#ifdef _mm_srli_si128
#undef _mm_srli_si128
#define _mm_srli_si128(a, count) __extension__ ({ \
  (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
#endif

// If _mm_shuffle_epi32 is a macro, replace it with a __builtin_shufflevector on the
// 4 x 32-bit view of a; the four 2-bit fields of imm (lowest first) select the source
// element for result elements 0 to 3.
#ifdef _mm_shuffle_epi32
#undef _mm_shuffle_epi32
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
#endif

// If _mm_shufflelo_epi16 is a macro, replace it with a __builtin_shufflevector on the
// 8 x 16-bit view of a; the four 2-bit fields of imm select the source element for result
// elements 0 to 3, while elements 4 to 7 are passed through unchanged.
#ifdef _mm_shufflelo_epi16
#undef _mm_shufflelo_epi16
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })
#endif

// If _mm_shufflehi_epi16 is a macro, replace it with a __builtin_shufflevector on the
// 8 x 16-bit view of a; elements 0 to 3 are passed through unchanged, and the four 2-bit
// fields of imm (offset by 4) select the source element for result elements 4 to 7.
#ifdef _mm_shufflehi_epi16
#undef _mm_shufflehi_epi16
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
#endif

// If _mm_shuffle_pd is a macro, replace it with a __builtin_shufflevector: bit 0 of i
// selects the element taken from a, bit 1 of i selects the element taken from b.
#ifdef _mm_shuffle_pd
#undef _mm_shuffle_pd
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); })
#endif

#endif // Vc_CLANG || Vc_APPLECLANG

#endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H_

/*  This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_CONST_DATA_H_
#define VC_AVX_CONST_DATA_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{

alignas(64) extern const unsigned int   _IndexesFromZero32[ 8];
alignas(16) extern const unsigned short _IndexesFromZero16[16];
alignas(16) extern const unsigned char  _IndexesFromZero8 [32];

// Collection of static constants in namespace AVX. Only the declarations are visible here;
// the values are defined elsewhere. The struct type itself is declared with 64-byte
// alignment.
// NOTE(review): the member names suggest bit masks and constants for float/double/short
// arithmetic — confirm the exact values against the definitions.
struct alignas(64) c_general
{
    static const float oneFloat;
    static const unsigned int absMaskFloat[2];
    static const unsigned int signMaskFloat[2];
    static const unsigned int highMaskFloat;
    static const unsigned short minShort[2];
    static const unsigned short one16[2];
    static const float _2power31;
    static const double oneDouble;
    static const unsigned long long frexpMask;
    static const unsigned long long highMaskDouble;
};

// Table of constants of type T, declared with 64-byte alignment and an unspecified bound.
// Unless Vc_MSVC is defined, the float and double specializations of the static member are
// declared here explicitly; the definitions live elsewhere.
// NOTE(review): presumably coefficients for trigonometric functions, judging by the name —
// confirm.
template<typename T> struct c_trig
{
    alignas(64) static const T data[];
};
#ifndef Vc_MSVC
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif

// Table of 21 constants stored as unsigned int bit patterns (64-byte aligned).
// d(i) returns entry i reinterpreted as a float, reading through a may-alias typedef.
// No bounds check is performed on i. Unless Vc_MSVC is defined, the float specialization of
// the static member is declared here explicitly; the definition lives elsewhere.
template<typename T> struct c_log
{
    typedef float floatAlias Vc_MAY_ALIAS;
    static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast<const floatAlias *>(&data[i]); }
    alignas(64) static const unsigned int data[21];
};
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21];
#endif

// double specialization: 21 constants stored as unsigned long long bit patterns (64-byte
// aligned). d(i) returns entry i reinterpreted as a double, reading through a may-alias
// typedef; no bounds check is performed on i. Size is 16 / sizeof(double).
template<> struct c_log<double>
{
    enum VectorSize { Size = 16 / sizeof(double) };
    typedef double doubleAlias Vc_MAY_ALIAS;
    static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast<const doubleAlias *>(&data[i]); }
    alignas(64) static const unsigned long long data[21];
};

}  // namespace AVX
}  // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
    using AVX::_IndexesFromZero8;
    using AVX::_IndexesFromZero16;
    using AVX::_IndexesFromZero32;
    using AVX::c_general;
    using AVX::c_trig;
    using AVX::c_log;
}  // namespace AVX2
}  // namespace Vc

#endif // VC_AVX_CONST_DATA_H_
#include <cstdlib>

#if (defined Vc_CLANG && Vc_CLANG >= 0x30900)
// If _mm256_permute2f128_si256 is a macro, replace it with a direct call to
// __builtin_ia32_vperm2f128_si256 on the 8 x 32-bit integer view of the operands, with the
// selector M cast to char.
#ifdef _mm256_permute2f128_si256
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (char)(M)); })
#endif

#ifdef _mm256_permute2f128_ps
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (char)(M)); })
#endif

#ifdef _mm256_permute2x128_si256
#undef _mm256_permute2x128_si256
#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
#endif
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
    using AVX::c_general;
    using AVX::_IndexesFromZero32;
    using AVX::_IndexesFromZero16;
    using AVX::_IndexesFromZero8;

    typedef __m128  m128 ;
    typedef __m128d m128d;
    typedef __m128i m128i;
    typedef __m256  m256 ;
    typedef __m256d m256d;
    typedef __m256i m256i;

    typedef const m128  param128 ;
    typedef const m128d param128d;
    typedef const m128i param128i;
    typedef const m256  param256 ;
    typedef const m256d param256d;
    typedef const m256i param256i;

#ifdef Vc_GCC
    // GCC only: shadow the mul/add/sub intrinsics with versions built on GCC's vector
    // operators instead of builtin functions, so that the fp-contraction optimization can
    // fuse them into FMA instructions.

    // double precision
    static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b)
    {
        const __v4df lhs = static_cast<__v4df>(a);
        const __v4df rhs = static_cast<__v4df>(b);
        return static_cast<m256d>(lhs * rhs);
    }
    static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b)
    {
        const __v4df lhs = static_cast<__v4df>(a);
        const __v4df rhs = static_cast<__v4df>(b);
        return static_cast<m256d>(lhs + rhs);
    }
    static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b)
    {
        const __v4df lhs = static_cast<__v4df>(a);
        const __v4df rhs = static_cast<__v4df>(b);
        return static_cast<m256d>(lhs - rhs);
    }

    // single precision
    static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b)
    {
        const __v8sf lhs = static_cast<__v8sf>(a);
        const __v8sf rhs = static_cast<__v8sf>(b);
        return static_cast<m256>(lhs * rhs);
    }
    static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b)
    {
        const __v8sf lhs = static_cast<__v8sf>(a);
        const __v8sf rhs = static_cast<__v8sf>(b);
        return static_cast<m256>(lhs + rhs);
    }
    static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b)
    {
        const __v8sf lhs = static_cast<__v8sf>(a);
        const __v8sf rhs = static_cast<__v8sf>(b);
        return static_cast<m256>(lhs - rhs);
    }
#endif

    // Broadcast one scalar into every element of a 256-bit register.
    static Vc_INTRINSIC m256  Vc_CONST set1_ps(float a)
    {
        return _mm256_set1_ps(a);
    }
    static Vc_INTRINSIC m256d Vc_CONST set1_pd(double a)
    {
        return _mm256_set1_pd(a);
    }
    static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a)
    {
        return _mm256_set1_epi32(a);
    }
    // An unsigned variant is intentionally left disabled:
    //static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epu32(unsigned int a) { return ::_mm256_set1_epu32(a); }

    // 128-bit registers loaded (aligned load) from the Common::AllBitsSet table.
    static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128()
    {
        const __m128i *const table = reinterpret_cast<const __m128i *>(Common::AllBitsSet);
        return _mm_load_si128(table);
    }
    static Vc_INTRINSIC Vc_CONST m128  _mm_setallone_ps()
    {
        const float *const table = reinterpret_cast<const float *>(Common::AllBitsSet);
        return _mm_load_ps(table);
    }
    static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd()
    {
        const double *const table = reinterpret_cast<const double *>(Common::AllBitsSet);
        return _mm_load_pd(table);
    }

    // 256-bit registers loaded (aligned load) from the Common::AllBitsSet table.
    static Vc_INTRINSIC Vc_CONST m256i setallone_si256()
    {
        const float *const table = reinterpret_cast<const float *>(Common::AllBitsSet);
        // there is a float load but no cast-free integer equivalent used here
        return _mm256_castps_si256(_mm256_load_ps(table));
    }
    static Vc_INTRINSIC Vc_CONST m256d setallone_pd()
    {
        const double *const table = reinterpret_cast<const double *>(Common::AllBitsSet);
        return _mm256_load_pd(table);
    }
    static Vc_INTRINSIC Vc_CONST m256  setallone_ps()
    {
        const float *const table = reinterpret_cast<const float *>(Common::AllBitsSet);
        return _mm256_load_ps(table);
    }

    // 128-bit integer "one" vectors. The 16- and 32-bit variants broadcast a 32-bit word
    // from a constant table through the float broadcast instruction and cast the result.
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8 ()
    {
        return _mm_set1_epi8(1);
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8 ()
    {
        return _mm_setone_epi8();
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16()
    {
        const float *const word = reinterpret_cast<const float *>(c_general::one16);
        return _mm_castps_si128(_mm_broadcast_ss(word));
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16()
    {
        return _mm_setone_epi16();
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32()
    {
        // entry 1 of the index table serves as the 32-bit source word
        const float *const word = reinterpret_cast<const float *>(&_IndexesFromZero32[1]);
        return _mm_castps_si128(_mm_broadcast_ss(word));
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu32()
    {
        return _mm_setone_epi32();
    }

    // 256-bit counterparts of the functions above.
    static Vc_INTRINSIC m256i Vc_CONST setone_epi8 ()
    {
        return _mm256_set1_epi8(1);
    }
    static Vc_INTRINSIC m256i Vc_CONST setone_epu8 ()
    {
        return setone_epi8();
    }
    static Vc_INTRINSIC m256i Vc_CONST setone_epi16()
    {
        const float *const word = reinterpret_cast<const float *>(c_general::one16);
        return _mm256_castps_si256(_mm256_broadcast_ss(word));
    }
    static Vc_INTRINSIC m256i Vc_CONST setone_epu16()
    {
        return setone_epi16();
    }
    static Vc_INTRINSIC m256i Vc_CONST setone_epi32()
    {
        // entry 1 of the index table serves as the 32-bit source word
        const float *const word = reinterpret_cast<const float *>(&_IndexesFromZero32[1]);
        return _mm256_castps_si256(_mm256_broadcast_ss(word));
    }
    static Vc_INTRINSIC m256i Vc_CONST setone_epu32()
    {
        return setone_epi32();
    }

    // Floating-point "one" vectors, broadcast from c_general.
    static Vc_INTRINSIC m256  Vc_CONST setone_ps()
    {
        return _mm256_broadcast_ss(&c_general::oneFloat);
    }
    static Vc_INTRINSIC m256d Vc_CONST setone_pd()
    {
        return _mm256_broadcast_sd(&c_general::oneDouble);
    }

    // Absolute-value and sign masks. The double variants read both 32-bit words of the
    // table as one 64-bit pattern; the float variants read only word [1].
    static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd()
    {
        const double *const bits = reinterpret_cast<const double *>(&c_general::absMaskFloat[0]);
        return _mm256_broadcast_sd(bits);
    }
    static Vc_INTRINSIC m256  Vc_CONST setabsmask_ps()
    {
        const float *const bits = reinterpret_cast<const float *>(&c_general::absMaskFloat[1]);
        return _mm256_broadcast_ss(bits);
    }
    static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd()
    {
        const double *const bits = reinterpret_cast<const double *>(&c_general::signMaskFloat[0]);
        return _mm256_broadcast_sd(bits);
    }
    static Vc_INTRINSIC m256  Vc_CONST setsignmask_ps()
    {
        const float *const bits = reinterpret_cast<const float *>(&c_general::signMaskFloat[1]);
        return _mm256_broadcast_ss(bits);
    }

    // 2^31 as float vectors (from c_general::_2power31) and as integer vectors (taken
    // from word [1] of the sign mask table).
    static Vc_INTRINSIC m256  Vc_CONST set2power31_ps()
    {
        return _mm256_broadcast_ss(&c_general::_2power31);
    }
    static Vc_INTRINSIC m128  Vc_CONST _mm_set2power31_ps()
    {
        return _mm_broadcast_ss(&c_general::_2power31);
    }
    static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32()
    {
        const float *const bits = reinterpret_cast<const float *>(&c_general::signMaskFloat[1]);
        return _mm256_castps_si256(_mm256_broadcast_ss(bits));
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32()
    {
        const float *const bits = reinterpret_cast<const float *>(&c_general::signMaskFloat[1]);
        return _mm_castps_si128(_mm_broadcast_ss(bits));
    }

    // Vectors filled with the most negative value of the given signed element width.
    // NOTE(review): for the 16- and 32-bit variants this relies on the contents of
    // c_general::minShort and c_general::signMaskFloat[1] — confirm at their definitions.
    static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 ()
    {
        return _mm256_set1_epi8(-0x80);
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16()
    {
        const float *const bits = reinterpret_cast<const float *>(c_general::minShort);
        return _mm_castps_si128(_mm_broadcast_ss(bits));
    }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32()
    {
        const float *const bits = reinterpret_cast<const float *>(&c_general::signMaskFloat[1]);
        return _mm_castps_si128(_mm_broadcast_ss(bits));
    }
    static Vc_INTRINSIC m256i Vc_CONST setmin_epi16()
    {
        const float *const bits = reinterpret_cast<const float *>(c_general::minShort);
        return _mm256_castps_si256(_mm256_broadcast_ss(bits));
    }
    static Vc_INTRINSIC m256i Vc_CONST setmin_epi32()
    {
        const float *const bits = reinterpret_cast<const float *>(&c_general::signMaskFloat[1]);
        return _mm256_castps_si256(_mm256_broadcast_ss(bits));
    }

    // Extract element number i from a 128-bit integer register and return it as an
    // unsigned value of the element width (8, 16 or 32 bits).
    template <int i> static Vc_INTRINSIC Vc_CONST unsigned char extract_epu8(__m128i x)
    {
        const unsigned char element = _mm_extract_epi8(x, i);
        return element;
    }
    template <int i> static Vc_INTRINSIC Vc_CONST unsigned short extract_epu16(__m128i x)
    {
        const unsigned short element = _mm_extract_epi16(x, i);
        return element;
    }
    template <int i> static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
    {
        const unsigned int element = _mm_extract_epi32(x, i);
        return element;
    }

    // insert128<offset>(a, b): copy of a with the 128-bit lane selected by offset
    // replaced by b. Overloaded for float, double and integer registers.
    template <int offset> Vc_INTRINSIC __m256  insert128(__m256  a, __m128  b)
    {
        return _mm256_insertf128_ps(a, b, offset);
    }
    template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b)
    {
        return _mm256_insertf128_pd(a, b, offset);
    }
    template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b)
    {
        // use the integer-domain instruction when AVX2 is available
#ifdef Vc_IMPL_AVX2
        return _mm256_inserti128_si256(a, b, offset);
#else
        return _mm256_insertf128_si256(a, b, offset);
#endif
    }

    // extract128<offset>(a): the 128-bit lane of a selected by offset.
    // Overloaded for float, double and integer registers.
    template <int offset> Vc_INTRINSIC __m128  extract128(__m256  a)
    {
        return _mm256_extractf128_ps(a, offset);
    }
    template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a)
    {
        return _mm256_extractf128_pd(a, offset);
    }
    template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a)
    {
        // use the integer-domain instruction when AVX2 is available
#ifdef Vc_IMPL_AVX2
        return _mm256_extracti128_si256(a, offset);
#else
        return _mm256_extractf128_si256(a, offset);
#endif
    }

    /////////////////////// COMPARE OPS ///////////////////////
    // Element-wise floating-point comparisons; every element of the result is either all
    // ones (true) or all zeros (false).
    //
    // The plain relations (eq, lt, le, ge, gt) use ordered predicates and are therefore
    // false whenever an operand is NaN. The negated relations (neq, nlt, nle) use
    // unordered predicates and are true whenever an operand is NaN. cmpge/cmpgt used to
    // share the unordered predicates of cmpnlt/cmpnle, which made `a >= NaN` and
    // `a > NaN` evaluate to true; they now use _CMP_GE_OS/_CMP_GT_OS (signaling, like the
    // predicates they replace).
    static Vc_INTRINSIC m256d Vc_CONST cmpeq_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
    static Vc_INTRINSIC m256d Vc_CONST cmpneq_pd  (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
    static Vc_INTRINSIC m256d Vc_CONST cmplt_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
    static Vc_INTRINSIC m256d Vc_CONST cmpnlt_pd  (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
    static Vc_INTRINSIC m256d Vc_CONST cmpge_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_GE_OS); }
    static Vc_INTRINSIC m256d Vc_CONST cmple_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
    static Vc_INTRINSIC m256d Vc_CONST cmpnle_pd  (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
    static Vc_INTRINSIC m256d Vc_CONST cmpgt_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_GT_OS); }
    // true where neither operand is NaN / where at least one operand is NaN
    static Vc_INTRINSIC m256d Vc_CONST cmpord_pd  (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
    static Vc_INTRINSIC m256d Vc_CONST cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }

    // single-precision counterparts with the same predicates
    static Vc_INTRINSIC m256  Vc_CONST cmpeq_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
    static Vc_INTRINSIC m256  Vc_CONST cmpneq_ps  (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
    static Vc_INTRINSIC m256  Vc_CONST cmplt_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
    static Vc_INTRINSIC m256  Vc_CONST cmpnlt_ps  (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
    static Vc_INTRINSIC m256  Vc_CONST cmpge_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_GE_OS); }
    static Vc_INTRINSIC m256  Vc_CONST cmple_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
    static Vc_INTRINSIC m256  Vc_CONST cmpnle_ps  (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
    static Vc_INTRINSIC m256  Vc_CONST cmpgt_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_GT_OS); }
    static Vc_INTRINSIC m256  Vc_CONST cmpord_ps  (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
    static Vc_INTRINSIC m256  Vc_CONST cmpunord_ps(__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }

#if defined(Vc_IMPL_XOP)
    // XOP offers unsigned 16-bit comparisons directly.
    static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b)
    {
        return _mm_comlt_epu16(a, b);
    }
    static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b)
    {
        return _mm_comgt_epu16(a, b);
    }
#else
    // Without XOP: XOR both operands with the _mm_setmin_epi16() pattern and compare the
    // results as signed values.
    // NOTE(review): this yields the unsigned ordering provided the pattern is 0x8000 per
    // element — confirm at the definition of c_general::minShort.
    static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b)
    {
        const m128i lhs = _mm_xor_si128(a, _mm_setmin_epi16());
        const m128i rhs = _mm_xor_si128(b, _mm_setmin_epi16());
        return _mm_cmplt_epi16(lhs, rhs);
    }
    static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b)
    {
        const m128i lhs = _mm_xor_si128(a, _mm_setmin_epi16());
        const m128i rhs = _mm_xor_si128(b, _mm_setmin_epi16());
        return _mm_cmpgt_epi16(lhs, rhs);
    }
#endif

#ifdef Vc_IMPL_AVX2
    template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
    {
        return _mm256_alignr_epi8(s1, s2, shift);
    }
#else
    template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
    {
        return insert128<1>(
            _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
                                                   _mm256_castsi256_si128(s2), shift)),
            _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
    }
#endif

// Helper macros that generate 256-bit integer wrapper functions called `name`:
//   Vc_AVX_TO_SSE_2_NEW(name)        name(__m256i, __m256i)
//   Vc_AVX_TO_SSE_256_128(name)      name(__m256i, __m128i)
//   Vc_AVX_TO_SSE_1i(name)           name<i>(__m256i), i is passed as the immediate operand
//   Vc_AVX_TO_SSE_1(name)            name(__m256i)
//   Vc_AVX_TO_SSE_1_128(name, shift) name(__m128i), returning a __m256i
#ifdef Vc_IMPL_AVX2
// AVX2 build: every generated function forwards to the 256-bit intrinsic `_mm256_##name`.
#define Vc_AVX_TO_SSE_2_NEW(name)                                                        \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0)                             \
    {                                                                                    \
        return _mm256_##name(a0, b0);                                                    \
    }
#define Vc_AVX_TO_SSE_256_128(name)                                                      \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0)                             \
    {                                                                                    \
        return _mm256_##name(a0, b0);                                                    \
    }
#define Vc_AVX_TO_SSE_1i(name)                                                           \
    template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0)                        \
    {                                                                                    \
        return _mm256_##name(a0, i);                                                     \
    }
#define Vc_AVX_TO_SSE_1(name)                                                            \
    Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
// shift__ is unused in the AVX2 variant.
#define Vc_AVX_TO_SSE_1_128(name, shift__)                                               \
    Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
#else
/**\internal
 * AVX build without AVX2: each macro defines the function \p name, which calls the
 * 128-bit intrinsic `_mm_##name` separately on the low and on the high 128-bit half of
 * its __m256i argument(s) and combines the two results into one __m256i.
 *
 * In case the AVX2 intrinsics are enabled, the arguments are directly passed to a single
 * `_mm256_##name` call (see the definitions in the other preprocessor branch).
 */
#define Vc_AVX_TO_SSE_1(name)                                                            \
    Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0)                                       \
    {                                                                                    \
        __m128i a1 = extract128<1>(a0);                                                  \
        __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0));                             \
        __m128i r1 = _mm_##name(a1);                                                     \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
// The 128-bit input feeds both result halves: the low half is computed from a0 as is,
// the high half from a0 shifted right by shift__ bytes.
#define Vc_AVX_TO_SSE_1_128(name, shift__)                                               \
    Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0)                                       \
    {                                                                                    \
        __m128i r0 = _mm_##name(a0);                                                     \
        __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__));                            \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#define Vc_AVX_TO_SSE_2_NEW(name)                                                        \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0)                             \
    {                                                                                    \
        m128i a1 = extract128<1>(a0);                                                    \
        m128i b1 = extract128<1>(b0);                                                    \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0));   \
        m128i r1 = _mm_##name(a1, b1);                                                   \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
// The same 128-bit operand b0 is applied to both halves of a0.
#define Vc_AVX_TO_SSE_256_128(name)                                                      \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0)                             \
    {                                                                                    \
        m128i a1 = extract128<1>(a0);                                                    \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0);                           \
        m128i r1 = _mm_##name(a1, b0);                                                   \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#define Vc_AVX_TO_SSE_1i(name)                                                           \
    template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0)                        \
    {                                                                                    \
        m128i a1 = extract128<1>(a0);                                                    \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i);                            \
        m128i r1 = _mm_##name(a1, i);                                                    \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#endif
    // 128-bit shifts where the shift count is taken from a second register. These are
    // plain forwards to the SSE intrinsics, made available under the prefix-free names.

    // logical left shift
    Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b)
    {
        return _mm_sll_epi16(a, b);
    }
    Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b)
    {
        return _mm_sll_epi32(a, b);
    }
    Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b)
    {
        return _mm_sll_epi64(a, b);
    }

    // logical right shift
    Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b)
    {
        return _mm_srl_epi16(a, b);
    }
    Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b)
    {
        return _mm_srl_epi32(a, b);
    }
    Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b)
    {
        return _mm_srl_epi64(a, b);
    }

    // arithmetic right shift
    Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b)
    {
        return _mm_sra_epi16(a, b);
    }
    Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b)
    {
        return _mm_sra_epi32(a, b);
    }

    // Instantiate the 256-bit integer wrappers. Each line defines one function with the
    // given name via the Vc_AVX_TO_SSE_* macros defined above.

    // shifts by a compile-time immediate: name<i>(__m256i)
    Vc_AVX_TO_SSE_1i(slli_epi16)
    Vc_AVX_TO_SSE_1i(slli_epi32)
    Vc_AVX_TO_SSE_1i(slli_epi64)
    Vc_AVX_TO_SSE_1i(srai_epi16)
    Vc_AVX_TO_SSE_1i(srai_epi32)
    Vc_AVX_TO_SSE_1i(srli_epi16)
    Vc_AVX_TO_SSE_1i(srli_epi32)
    Vc_AVX_TO_SSE_1i(srli_epi64)

    // shifts by a count held in a 128-bit register: name(__m256i, __m128i)
    Vc_AVX_TO_SSE_256_128(sll_epi16)
    Vc_AVX_TO_SSE_256_128(sll_epi32)
    Vc_AVX_TO_SSE_256_128(sll_epi64)
    Vc_AVX_TO_SSE_256_128(srl_epi16)
    Vc_AVX_TO_SSE_256_128(srl_epi32)
    Vc_AVX_TO_SSE_256_128(srl_epi64)
    Vc_AVX_TO_SSE_256_128(sra_epi16)
    Vc_AVX_TO_SSE_256_128(sra_epi32)

    // binary operations: name(__m256i, __m256i)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
    Vc_AVX_TO_SSE_2_NEW(packs_epi16)
    Vc_AVX_TO_SSE_2_NEW(packs_epi32)
    Vc_AVX_TO_SSE_2_NEW(packus_epi16)
    Vc_AVX_TO_SSE_2_NEW(unpackhi_epi8)
    Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
    Vc_AVX_TO_SSE_2_NEW(unpackhi_epi32)
    Vc_AVX_TO_SSE_2_NEW(unpackhi_epi64)
    Vc_AVX_TO_SSE_2_NEW(unpacklo_epi8)
    Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
    Vc_AVX_TO_SSE_2_NEW(unpacklo_epi32)
    Vc_AVX_TO_SSE_2_NEW(unpacklo_epi64)
    Vc_AVX_TO_SSE_2_NEW(add_epi8)
    Vc_AVX_TO_SSE_2_NEW(add_epi16)
    Vc_AVX_TO_SSE_2_NEW(add_epi32)
    Vc_AVX_TO_SSE_2_NEW(add_epi64)
    Vc_AVX_TO_SSE_2_NEW(adds_epi8)
    Vc_AVX_TO_SSE_2_NEW(adds_epi16)
    Vc_AVX_TO_SSE_2_NEW(adds_epu8)
    Vc_AVX_TO_SSE_2_NEW(adds_epu16)
    Vc_AVX_TO_SSE_2_NEW(sub_epi8)
    Vc_AVX_TO_SSE_2_NEW(sub_epi16)
    Vc_AVX_TO_SSE_2_NEW(sub_epi32)
    Vc_AVX_TO_SSE_2_NEW(sub_epi64)
    Vc_AVX_TO_SSE_2_NEW(subs_epi8)
    Vc_AVX_TO_SSE_2_NEW(subs_epi16)
    Vc_AVX_TO_SSE_2_NEW(subs_epu8)
    Vc_AVX_TO_SSE_2_NEW(subs_epu16)
    Vc_AVX_TO_SSE_2_NEW(madd_epi16)
    Vc_AVX_TO_SSE_2_NEW(mulhi_epi16)
    Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
    Vc_AVX_TO_SSE_2_NEW(mul_epu32)
    Vc_AVX_TO_SSE_2_NEW(max_epi16)
    Vc_AVX_TO_SSE_2_NEW(max_epu8)
    Vc_AVX_TO_SSE_2_NEW(min_epi16)
    Vc_AVX_TO_SSE_2_NEW(min_epu8)
    Vc_AVX_TO_SSE_2_NEW(mulhi_epu16)
    // The commented names in this list are intrinsics for which no wrapper is generated:
    // shufflehi_epi16
    // shufflelo_epi16 (__m128i __A, const int __mask)
    // shuffle_epi32 (__m128i __A, const int __mask)
    // maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
    Vc_AVX_TO_SSE_2_NEW(avg_epu8)
    Vc_AVX_TO_SSE_2_NEW(avg_epu16)
    Vc_AVX_TO_SSE_2_NEW(sad_epu8)
    // stream_si32 (int *__A, int __B)
    // stream_si128 (__m128i *__A, __m128i __B)
    // cvtsi32_si128 (int __A)
    // cvtsi64_si128 (long long __A)
    // cvtsi64x_si128 (long long __A)
    Vc_AVX_TO_SSE_2_NEW(hadd_epi16)
    Vc_AVX_TO_SSE_2_NEW(hadd_epi32)
    Vc_AVX_TO_SSE_2_NEW(hadds_epi16)
    Vc_AVX_TO_SSE_2_NEW(hsub_epi16)
    Vc_AVX_TO_SSE_2_NEW(hsub_epi32)
    Vc_AVX_TO_SSE_2_NEW(hsubs_epi16)
    Vc_AVX_TO_SSE_2_NEW(maddubs_epi16)
    Vc_AVX_TO_SSE_2_NEW(mulhrs_epi16)
    Vc_AVX_TO_SSE_2_NEW(shuffle_epi8)
    Vc_AVX_TO_SSE_2_NEW(sign_epi8)
    Vc_AVX_TO_SSE_2_NEW(sign_epi16)
    Vc_AVX_TO_SSE_2_NEW(sign_epi32)
    Vc_AVX_TO_SSE_2_NEW(min_epi8)
    Vc_AVX_TO_SSE_2_NEW(max_epi8)
    Vc_AVX_TO_SSE_2_NEW(min_epu16)
    Vc_AVX_TO_SSE_2_NEW(max_epu16)
    Vc_AVX_TO_SSE_2_NEW(min_epi32)
    Vc_AVX_TO_SSE_2_NEW(max_epi32)
    Vc_AVX_TO_SSE_2_NEW(min_epu32)
    Vc_AVX_TO_SSE_2_NEW(max_epu32)
    Vc_AVX_TO_SSE_2_NEW(mullo_epi32)
    Vc_AVX_TO_SSE_2_NEW(mul_epi32)

    // unary operations: name(__m256i)
    Vc_AVX_TO_SSE_1(abs_epi8)
    Vc_AVX_TO_SSE_1(abs_epi16)
    Vc_AVX_TO_SSE_1(abs_epi32)
    // widening conversions: name(__m128i) -> __m256i. The second argument is the number
    // of input bytes consumed per 128-bit result half; the non-AVX2 macro shifts the
    // input right by that many bytes before converting the high half.
    Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
    Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
    Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
    Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
    Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
    Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
    Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
    Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
    Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
    Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
    Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
    Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)

    Vc_AVX_TO_SSE_2_NEW(packus_epi32)

#ifndef Vc_IMPL_AVX2

/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX
/////////////////////////////////////////////////////////////////////////

    // Byte-wise shift right by i bytes, applied independently to each 128-bit half.
    template <int i> Vc_INTRINSIC Vc_CONST __m256i srli_si256(__m256i a0)
    {
        const __m128i shiftedLow = _mm_srli_si128(_mm256_castsi256_si128(a0), i);
        const __m128i shiftedHigh = _mm_srli_si128(extract128<1>(a0), i);
        return insert128<1>(_mm256_castsi128_si256(shiftedLow), shiftedHigh);
    }
    // Byte-wise shift left by i bytes, applied independently to each 128-bit half.
    template <int i> Vc_INTRINSIC Vc_CONST __m256i slli_si256(__m256i a0)
    {
        const __m128i shiftedLow = _mm_slli_si128(_mm256_castsi256_si128(a0), i);
        const __m128i shiftedHigh = _mm_slli_si128(extract128<1>(a0), i);
        return insert128<1>(_mm256_castsi128_si256(shiftedLow), shiftedHigh);
    }

    // Bitwise operations on 256-bit integer registers, carried out with the
    // single-precision float instructions (the casts only reinterpret the bits).
    static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y)
    {
        const __m256 xBits = _mm256_castsi256_ps(x);
        const __m256 yBits = _mm256_castsi256_ps(y);
        return _mm256_castps_si256(_mm256_and_ps(xBits, yBits));
    }
    static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y)
    {
        const __m256 xBits = _mm256_castsi256_ps(x);
        const __m256 yBits = _mm256_castsi256_ps(y);
        return _mm256_castps_si256(_mm256_andnot_ps(xBits, yBits));
    }
    static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y)
    {
        const __m256 xBits = _mm256_castsi256_ps(x);
        const __m256 yBits = _mm256_castsi256_ps(y);
        return _mm256_castps_si256(_mm256_or_ps(xBits, yBits));
    }
    static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y)
    {
        const __m256 xBits = _mm256_castsi256_ps(x);
        const __m256 yBits = _mm256_castsi256_ps(y);
        return _mm256_castps_si256(_mm256_xor_ps(xBits, yBits));
    }

    // 32-bit byte mask of a 256-bit register: bits 0-15 come from the low 128-bit half,
    // bits 16-31 from the high half.
    Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
    {
        const int highMask = _mm_movemask_epi8(extract128<1>(a0));
        const int lowMask = _mm_movemask_epi8(_mm256_castsi256_si128(a0));
        return (highMask << 16) | lowMask;
    }
    // 16-bit blend with a 16-bit compile-time selector: the low 8 bits of m control the
    // low 128-bit half, the bits above them control the high half.
    template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(param256i a0, param256i b0)
    {
        const m128i lowResult = _mm_blend_epi16(_mm256_castsi256_si128(a0),
                                                _mm256_castsi256_si128(b0), m & 0xff);
        const m128i highResult = _mm_blend_epi16(extract128<1>(a0), extract128<1>(b0), m >> 8);
        return insert128<1>(_mm256_castsi128_si256(lowResult), highResult);
    }
    // Byte-wise variable blend, performed separately on the two 128-bit halves.
    Vc_INTRINSIC Vc_CONST m256i blendv_epi8(param256i a0, param256i b0, param256i m0)
    {
        const m128i lowResult = _mm_blendv_epi8(_mm256_castsi256_si128(a0),
                                                _mm256_castsi256_si128(b0),
                                                _mm256_castsi256_si128(m0));
        const m128i highResult =
            _mm_blendv_epi8(extract128<1>(a0), extract128<1>(b0), extract128<1>(m0));
        return insert128<1>(_mm256_castsi128_si256(lowResult), highResult);
    }
    // mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
    // stream_load_si128 (__m128i *__X)

#else // Vc_IMPL_AVX2

// Bitwise operations: with AVX2 the integer instructions are available directly.
static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y)
{
    return _mm256_xor_si256(x, y);
}
static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y)
{
    return _mm256_or_si256(x, y);
}
static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y)
{
    return _mm256_and_si256(x, y);
}
static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y)
{
    return _mm256_andnot_si256(x, y);
}

// Byte-wise shifts by the compile-time count i.
template <int i> Vc_INTRINSIC Vc_CONST __m256i srli_si256(__m256i a0)
{
    const __m256i shifted = _mm256_srli_si256(a0, i);
    return shifted;
}
template <int i> Vc_INTRINSIC Vc_CONST __m256i slli_si256(__m256i a0)
{
    const __m256i shifted = _mm256_slli_si256(a0, i);
    return shifted;
}

/////////////////////////////////////////////////////////////////////////
// AVX2 counterparts of the helpers that the non-AVX2 branch has to emulate:
// here they simply forward to the native AVX2 intrinsics
/////////////////////////////////////////////////////////////////////////
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
{
    const __m256i blended = _mm256_blendv_epi8(a0, b0, m0);
    return blended;
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
    const int mask = _mm256_movemask_epi8(a0);
    return mask;
}

#endif // Vc_IMPL_AVX2

/////////////////////////////////////////////////////////////////////////
// implementation of intrinsics missing in AVX and AVX2
/////////////////////////////////////////////////////////////////////////

// Signed less-than: implemented as greater-than with the operands exchanged.
static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b)
{
    const m256i result = cmpgt_epi64(b, a);
    return result;
}
static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b)
{
    const m256i result = cmpgt_epi32(b, a);
    return result;
}
static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b)
{
    const m256i result = cmpgt_epi16(b, a);
    return result;
}
static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b)
{
    const m256i result = cmpgt_epi8(b, a);
    return result;
}

// Unsigned 8-bit comparisons: flipping the top bit of every byte (XOR with 0x80, the
// value setmin_epi8() provides) maps the unsigned order onto the signed order, so the
// signed comparison can be reused.
static Vc_INTRINSIC m256i cmplt_epu8(__m256i a, __m256i b)
{
    const m256i lhs = xor_si256(a, setmin_epi8());
    const m256i rhs = xor_si256(b, setmin_epi8());
    return cmplt_epi8(lhs, rhs);
}
static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b)
{
    const m256i lhs = xor_si256(a, setmin_epi8());
    const m256i rhs = xor_si256(b, setmin_epi8());
    return cmpgt_epi8(lhs, rhs);
}
#if defined(Vc_IMPL_XOP)
    // XOP: generate 256-bit wrappers for the native unsigned comparisons and expose them
    // under the cmplt/cmpgt names.
    // NOTE(review): in a build with both Vc_IMPL_AVX2 and Vc_IMPL_XOP the generator macro
    // would call `_mm256_com*` — confirm that such a configuration cannot occur.
    Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
    Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
    Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
    Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b)
    {
        return comlt_epu32(a, b);
    }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b)
    {
        return comgt_epu32(a, b);
    }
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b)
    {
        return comlt_epu16(a, b);
    }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b)
    {
        return comgt_epu16(a, b);
    }
#else
    // No XOP: XOR both operands with the setmin pattern of the element width (done with
    // the float XOR instruction; the casts only reinterpret bits) and compare the results
    // as signed integers.
    // NOTE(review): this yields the unsigned ordering provided the setmin patterns have
    // only the top bit of each element set — confirm at the c_general definitions.
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b)
    {
        const m256 lhs = _mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()));
        const m256 rhs = _mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()));
        return cmplt_epi32(_mm256_castps_si256(lhs), _mm256_castps_si256(rhs));
    }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b)
    {
        const m256 lhs = _mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()));
        const m256 rhs = _mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()));
        return cmpgt_epi32(_mm256_castps_si256(lhs), _mm256_castps_si256(rhs));
    }
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b)
    {
        const m256 lhs = _mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()));
        const m256 rhs = _mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()));
        return cmplt_epi16(_mm256_castps_si256(lhs), _mm256_castps_si256(rhs));
    }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b)
    {
        const m256 lhs = _mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()));
        const m256 rhs = _mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()));
        return cmpgt_epi16(_mm256_castps_si256(lhs), _mm256_castps_si256(rhs));
    }
#endif

// Masked stores of a 256-bit register, overloaded on the destination element type.
// Only the elements selected by mask are written to mem.

// float elements
static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v)
{
    const __m256i intMask = _mm256_castps_si256(mask);
    _mm256_maskstore_ps(mem, intMask, v);
}
// double elements
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v)
{
    const __m256i intMask = _mm256_castpd_si256(mask);
    _mm256_maskstore_pd(mem, intMask, v);
}
// 32-bit integer elements; without AVX2 the float masked store is used on the same bits
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v)
{
#ifdef Vc_IMPL_AVX2
    _mm256_maskstore_epi32(mem, mask, v);
#else
    float *const target = reinterpret_cast<float *>(mem);
    _mm256_maskstore_ps(target, mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v)
{
    int *const target = reinterpret_cast<int *>(mem);
    _mm256_maskstore(target, mask, v);
}
// 16-bit integer elements: two byte-masked 128-bit stores, the second one starting at
// element 8
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v)
{
    using namespace AVX;
    char *const lowTarget = reinterpret_cast<char *>(&mem[0]);
    _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), lowTarget);
    char *const highTarget = reinterpret_cast<char *>(&mem[8]);
    _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), highTarget);
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v)
{
    short *const target = reinterpret_cast<short *>(mem);
    _mm256_maskstore(target, mask, v);
}

#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i

// stream_load<R>(mem): load a register of type R from mem with _mm_stream_load_si128.
// The 256-bit variants are assembled from two consecutive 128-bit loads.

// float
template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
{
    // the intrinsic takes a non-const __m128i pointer
    __m128i *const source = reinterpret_cast<__m128i *>(const_cast<float *>(mem));
    return _mm_castsi128_ps(_mm_stream_load_si128(source));
}
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
{
    const m128 lowHalf = stream_load<m128>(mem);
    const m128 highHalf = stream_load<m128>(mem + 4);
    return insert128<1>(_mm256_castps128_ps256(lowHalf), highHalf);
}

// double
template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
{
    // the intrinsic takes a non-const __m128i pointer
    __m128i *const source = reinterpret_cast<__m128i *>(const_cast<double *>(mem));
    return _mm_castsi128_pd(_mm_stream_load_si128(source));
}
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
{
    const m128d lowHalf = stream_load<m128d>(mem);
    const m128d highHalf = stream_load<m128d>(mem + 2);
    return insert128<1>(_mm256_castpd128_pd256(lowHalf), highHalf);
}

// integers (untyped memory)
template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
{
    // the intrinsic takes a non-const __m128i pointer
    __m128i *const source = reinterpret_cast<__m128i *>(const_cast<void *>(mem));
    return _mm_stream_load_si128(source);
}
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
{
    const m128i lowHalf = stream_load<m128i>(mem);
    const m128i highHalf = stream_load<m128i>(static_cast<const __m128i *>(mem) + 1);
    return insert128<1>(_mm256_castsi128_si256(lowHalf), highHalf);
}

// stream_store(mem, value, mask): byte-masked store of value to mem via
// _mm_maskmoveu_si128. The 256-bit overloads issue one 128-bit store per half.

// float
Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
{
    char *const target = reinterpret_cast<char *>(mem);
    _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), target);
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
    const __m128 lowValue = _mm256_castps256_ps128(value);
    const __m128 lowMask = _mm256_castps256_ps128(mask);
    stream_store(mem, lowValue, lowMask);
    const __m128 highValue = extract128<1>(value);
    const __m128 highMask = extract128<1>(mask);
    stream_store(mem + 4, highValue, highMask);
}

// double
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
{
    char *const target = reinterpret_cast<char *>(mem);
    _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), target);
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
    const __m128d lowValue = _mm256_castpd256_pd128(value);
    const __m128d lowMask = _mm256_castpd256_pd128(mask);
    stream_store(mem, lowValue, lowMask);
    const __m128d highValue = extract128<1>(value);
    const __m128d highMask = extract128<1>(mask);
    stream_store(mem + 2, highValue, highMask);
}

// integers (untyped memory)
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
{
    char *const target = reinterpret_cast<char *>(mem);
    _mm_maskmoveu_si128(value, mask, target);
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
    const __m128i lowValue = _mm256_castsi256_si128(value);
    const __m128i lowMask = _mm256_castsi256_si128(mask);
    stream_store(mem, lowValue, lowMask);
    const __m128i highValue = extract128<1>(value);
    const __m128i highMask = extract128<1>(mask);
    stream_store(static_cast<__m128i *>(mem) + 1, highValue, highMask);
}

#ifndef __x86_64__
/**
 * Stand-in for the \c _mm_cvtsi64_si128 intrinsic, compiled only when \c __x86_64__ is
 * not defined.
 *
 * \param x 64-bit integer to place into a vector register.
 * \return a vector whose low 64 bits hold \p x and whose upper 64 bits are zero.
 */
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
    // Load the 64 bits with the integer load intrinsic instead of going through
    // _mm_load_sd: reading the int64_t object through a `const double *` is a
    // strict-aliasing violation, while the result (low 64 bits = x, high 64 bits = 0)
    // is the same for both loads.
    return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
}
#endif

}  // namespace AvxIntrinsics
}  // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
// Both implementation namespaces pull in every name declared in AvxIntrinsics, so the
// intrinsic wrappers can be used unqualified inside AVX and AVX2.
namespace AVX
{
    using namespace AvxIntrinsics;
}  // namespace AVX
namespace AVX2
{
    using namespace AvxIntrinsics;
}  // namespace AVX2
namespace AVX
{
    // Maps a scalar entry type to the 256-bit intrinsic type that holds it: every
    // integer type maps to __m256i, float to __m256 and double to __m256d.
    // The primary template is declared only; other types have no mapping.
    template<typename T> struct VectorTypeHelper;
    template<> struct VectorTypeHelper<         char > { using Type = __m256i; };
    template<> struct VectorTypeHelper<  signed char > { using Type = __m256i; };
    template<> struct VectorTypeHelper<unsigned char > { using Type = __m256i; };
    template<> struct VectorTypeHelper<         short> { using Type = __m256i; };
    template<> struct VectorTypeHelper<unsigned short> { using Type = __m256i; };
    template<> struct VectorTypeHelper<         int  > { using Type = __m256i; };
    template<> struct VectorTypeHelper<unsigned int  > { using Type = __m256i; };
    template<> struct VectorTypeHelper<         long > { using Type = __m256i; };
    template<> struct VectorTypeHelper<unsigned long > { using Type = __m256i; };
    template<> struct VectorTypeHelper<         long long> { using Type = __m256i; };
    template<> struct VectorTypeHelper<unsigned long long> { using Type = __m256i; };
    template<> struct VectorTypeHelper<         float> { using Type = __m256 ; };
    template<> struct VectorTypeHelper<        double> { using Type = __m256d; };

    // Maps an intrinsic vector type to the 128-bit type of the same element category.
    // 256-bit types map to their 128-bit counterpart; 128-bit types map to themselves.
    template<typename T> struct SseVectorType;
    template<> struct SseVectorType<__m256 > { using Type = __m128 ; };
    template<> struct SseVectorType<__m256i> { using Type = __m128i; };
    template<> struct SseVectorType<__m256d> { using Type = __m128d; };
    template<> struct SseVectorType<__m128 > { using Type = __m128 ; };
    template<> struct SseVectorType<__m128i> { using Type = __m128i; };
    template<> struct SseVectorType<__m128d> { using Type = __m128d; };

    // Select the integer / double / float intrinsic type matching the size of T:
    // the 128-bit type when T occupies 16 bytes, the 256-bit type otherwise.
    template <typename T>
    using IntegerVectorType =
        typename std::conditional<sizeof(T) == sizeof(__m128i), __m128i, __m256i>::type;
    template <typename T>
    using DoubleVectorType =
        typename std::conditional<sizeof(T) == sizeof(__m128d), __m128d, __m256d>::type;
    template <typename T>
    using FloatVectorType =
        typename std::conditional<sizeof(T) == sizeof(__m128), __m128, __m256>::type;

    // Primary templates of the per-type helper classes. VectorHelper is defined but
    // empty; GatherHelper and ScatterHelper are only declared here.
    template<typename T> struct VectorHelper {};
    template<typename T> struct GatherHelper;
    template<typename T> struct ScatterHelper;

    // Value defaults to 1 for every T in this primary template.
    template<typename T> struct HasVectorDivisionHelper { enum { Value = 1 }; };
    // Declared only; no definition is visible at this point.
    template<typename T> struct VectorHelperSize;
}  // namespace AVX
}  // namespace Vc

#endif // VC_AVX_INTRINSICS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_CASTS_H_
#define VC_AVX_CASTS_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_CASTS_H_
#define VC_SSE_CASTS_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_INTRINSICS_H_
#define VC_SSE_INTRINSICS_H_

#ifdef Vc_MSVC
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

/*  This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_STORAGE_H_
#define VC_COMMON_STORAGE_H_

/*  This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_ALIASINGENTRYHELPER_H_
#define VC_COMMON_ALIASINGENTRYHELPER_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

/**
 * Proxy for one scalar entry (of type \c StorageType::EntryType) of a storage object.
 *
 * The proxy supports assignment, compound assignment, comparison and arithmetic with
 * plain entry values and converts implicitly to the entry type.
 *
 * Two implementations are selected by the preprocessor:
 *  - with \c Vc_ICC the proxy stores a pointer to the storage plus the entry index and
 *    goes through the storage's \c assign(), \c m() and \c read() members;
 *  - otherwise it holds a reference to the entry, typed with \c Vc_MAY_ALIAS.
 *
 * The shared operators below the \c \#endif are written in terms of \c m_data, which is a
 * macro in the ICC branch and a data member in the other branch.
 */
template<class StorageType> class AliasingEntryHelper
{
    private:
        typedef typename StorageType::EntryType T;
#ifdef Vc_ICC
        StorageType *const m_storage;
        const int m_index;
    public:
        Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {}
        Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
        Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
        // Assigning one proxy to another copies the entry value, it does not rebind.
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
            m_storage->assign(m_index, rhs);
            return *this;
        }

        Vc_ALWAYS_INLINE AliasingEntryHelper &operator  =(T x) { m_storage->assign(m_index, x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; }
// Lets the shared read-only operators below use the same spelling in both branches.
#define m_data m_storage->read(m_index)
#else
        typedef T A Vc_MAY_ALIAS;
        A &m_data;
    public:
        // Binds the proxy to an arbitrary object, viewed as an entry of type T.
        // The constraint excludes AliasingEntryHelper itself: without it this template
        // is a better match than the copy constructor when copying from a non-const
        // lvalue proxy, and m_data would then refer to the bytes of the proxy object
        // instead of the entry it refers to.
        template <typename T2,
                  typename = typename std::enable_if<!std::is_same<
                      typename std::remove_cv<T2>::type, AliasingEntryHelper>::value>::type>
        Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast<A &>(d)) {}

        Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {}
        // Copying/moving a proxy yields a proxy for the same entry (same as the ICC branch).
        Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
        Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
        // Assigning one proxy to another copies the entry value, it does not rebind.
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
            m_data = rhs.m_data;
            return *this;
        }

        Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data  = x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; }
        Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; }
#endif

        // Implicit read access: the proxy converts to the entry value.
        Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; }

        // Comparisons against a plain entry value.
        Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast<T>(m_data) == x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast<T>(m_data) != x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast<T>(m_data) <= x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast<T>(m_data) >= x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast<T>(m_data) <  x; }
        Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast<T>(m_data) >  x; }

        // Non-modifying arithmetic / bitwise operators; each returns a plain T.
        Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast<T>(m_data); }
        Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast<T>(m_data); }
        Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast<T>(m_data) + x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast<T>(m_data) - x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast<T>(m_data) / x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast<T>(m_data) * x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast<T>(m_data) | x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast<T>(m_data) & x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast<T>(m_data) ^ x; }
        Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast<T>(m_data) % x; }
        //T operator<<(T x) const { return static_cast<T>(m_data) << x; }
        //T operator>>(T x) const { return static_cast<T>(m_data) >> x; }
// Do not leak the ICC-branch helper macro past the class definition.
#ifdef m_data
#undef m_data
#endif
};

}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_ALIASINGENTRYHELPER_H_
/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_MASKENTRY_H_
#define VC_COMMON_MASKENTRY_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

namespace
{
    // Maps a byte count (1, 2, 4 or 8) to the signed integer type of exactly that size.
    // The primary template is declared only, so other sizes are rejected at compile time.
    template<size_t Bytes> struct MaskBoolStorage;
    // the following four type aliases must use std::intN_t and NOT! Vc::intN_t. The latter
    // segfaults ICC 15.0.3.
    template<> struct MaskBoolStorage<1> { using type = std::int8_t ; };
    template<> struct MaskBoolStorage<2> { using type = std::int16_t; };
    template<> struct MaskBoolStorage<4> { using type = std::int32_t; };
    template<> struct MaskBoolStorage<8> { using type = std::int64_t; };
} // anonymous namespace

/**
 * Boolean stored in a signed integer of \p Bytes bytes.
 *
 * \c true is stored as -1 and \c false as 0. The storage type and the class itself are
 * both declared with \c Vc_MAY_ALIAS.
 */
template<size_t Bytes> class MaskBool
{
    typedef typename MaskBoolStorage<Bytes>::type storage_type Vc_MAY_ALIAS;
    storage_type data;
public:
    // Construct / assign from bool: true becomes -1, false becomes 0.
    constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {}
    Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; }
    // Assign from any other fundamental type: the object representation of x is
    // reinterpreted as storage_type (no value conversion).
    // NOTE(review): unlike the conversion operator below, this overload does not require
    // sizeof(T) == sizeof(storage_type) - confirm callers only pass same-sized values.
    template <typename T, typename = enable_if<(!std::is_same<T, bool>::value &&
                                                std::is_fundamental<T>::value)>>
    Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept
    {
        data = reinterpret_cast<const storage_type &>(x);
        return *this;
    }

    Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default;
    Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default;

    // Conversion to bool tests the lowest bit of the stored value. Conversion to any
    // other fundamental type of the same size returns the stored bits reinterpreted as T.
    template <typename T, typename = enable_if<(std::is_same<T, bool>::value ||
                                                (std::is_fundamental<T>::value &&
                                                 sizeof(storage_type) == sizeof(T)))>>
    constexpr operator T() const noexcept
    {
        return std::is_same<T, bool>::value ? T((data & 1) != 0)
                                            : reinterpret_cast<const MayAlias<T> &>(data);
    }
} Vc_MAY_ALIAS;

// Equality for any pair of operands that are both convertible to bool: the operands are
// converted to bool and the two truth values compared.
template <typename A, typename B,
          typename std::enable_if<std::is_convertible<A, bool>::value &&
                                      std::is_convertible<B, bool>::value,
                                  int>::type = 0>
constexpr bool operator==(A &&lhs, B &&rhs)
{
    return !(static_cast<bool>(lhs) != static_cast<bool>(rhs));
}
// Inequality for any pair of operands that are both convertible to bool: the operands are
// converted to bool and the two truth values compared.
template <typename A, typename B,
          typename std::enable_if<std::is_convertible<A, bool>::value &&
                                      std::is_convertible<B, bool>::value,
                                  int>::type = 0>
constexpr bool operator!=(A &&lhs, B &&rhs)
{
    return !(static_cast<bool>(lhs) == static_cast<bool>(rhs));
}

// Compile-time checks that a 4-byte MaskBool compares equal to true when constructed
// from true and unequal to true when constructed from false.
static_assert(true == MaskBool<4>(true), "true == MaskBool<4>(true)");
static_assert(true != MaskBool<4>(false), "true != MaskBool<4>(false)");

}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_MASKENTRY_H_
#ifdef Vc_IMPL_AVX
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Declaration only: returns a value of vector type V. The Union-strategy Storage below
// uses it to initialise its data member in the default constructor.
// NOTE(review): presumably yields an all-zero vector, as the name says - the definition
// is not visible here.
template <typename V> inline V zero();
}  // namespace Detail
namespace Common
{
namespace Detail
{
// IntrinsicType<ValueType, Size>::type names the intrinsic vector type used to hold Size
// entries of ValueType. Exactly one of the four definitions below is compiled, chosen by
// the Vc_IMPL_* macro that is defined.
#ifdef Vc_IMPL_AVX
// AVX: integral types use __m128i/__m256i, double uses __m128d/__m256d and everything
// else uses __m128/__m256. The 128-bit type is chosen when sizeof(ValueType) * Size is
// 16 bytes, the 256-bit type otherwise.
template <typename ValueType, size_t Size> struct IntrinsicType {
    using type = typename std::conditional<
        std::is_integral<ValueType>::value,
        typename std::conditional<sizeof(ValueType) * Size == 16, __m128i, __m256i>::type,
        typename std::conditional<
            std::is_same<ValueType, double>::value,
            typename std::conditional<sizeof(ValueType) * Size == 16, __m128d,
                                      __m256d>::type,
            typename std::conditional<sizeof(ValueType) * Size == 16, __m128,
                                      __m256>::type>::type>::type;
};
#elif defined Vc_IMPL_SSE
// SSE: always a 128-bit type; Size does not influence the choice.
template <typename ValueType, size_t Size> struct IntrinsicType {
    using type = typename std::conditional<
        std::is_integral<ValueType>::value, __m128i,
        typename std::conditional<std::is_same<ValueType, double>::value, __m128d,
                                  __m128>::type>::type;
};
#elif defined Vc_IMPL_MIC
// MIC: always a 512-bit type; Size does not influence the choice.
template <typename ValueType, size_t Size> struct IntrinsicType {
    using type = typename std::conditional<
        std::is_integral<ValueType>::value, __m512i,
        typename std::conditional<std::is_same<ValueType, double>::value, __m512d,
                                  __m512>::type>::type;
};
#else
// No SIMD target: the "vector" is the scalar type itself, and only Size == 1 is accepted.
template <typename ValueType, size_t Size> struct IntrinsicType {
    static_assert(Size == 1,
                  "IntrinsicType without SIMD target support may only have Size = 1");
    using type = ValueType;
};
#endif
// BuiltinType<ValueType, Size>::type is a compiler vector type declared with the
// __vector_size__ attribute. Bytes defaults to the total size of Size entries and selects
// the specialization. Only the primary template is declared unless
// Vc_USE_BUILTIN_VECTOR_TYPES is defined.
template <typename ValueType, size_t Size, size_t Bytes = sizeof(ValueType) * Size>
struct BuiltinType;
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
// 16-byte vectors. Note that bool entries are stored as unsigned char.
#define Vc_VECBUILTIN __attribute__((__vector_size__(16)))
template <size_t Size> struct BuiltinType<         double   , Size, 16> { typedef          double    type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         float    , Size, 16> { typedef          float     type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         long long, Size, 16> { typedef          long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 16> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         long     , Size, 16> { typedef          long      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long     , Size, 16> { typedef unsigned long      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         int      , Size, 16> { typedef          int       type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int      , Size, 16> { typedef unsigned int       type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         short    , Size, 16> { typedef          short     type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short    , Size, 16> { typedef unsigned short     type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         char     , Size, 16> { typedef          char      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char     , Size, 16> { typedef unsigned char      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<  signed char     , Size, 16> { typedef   signed char      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         bool     , Size, 16> { typedef unsigned char      type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
// 32-byte vectors; same mapping as above with the macro redefined for the larger size.
#define Vc_VECBUILTIN __attribute__((__vector_size__(32)))
template <size_t Size> struct BuiltinType<         double   , Size, 32> { typedef          double    type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         float    , Size, 32> { typedef          float     type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         long long, Size, 32> { typedef          long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 32> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         long     , Size, 32> { typedef          long      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long     , Size, 32> { typedef unsigned long      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         int      , Size, 32> { typedef          int       type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int      , Size, 32> { typedef unsigned int       type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         short    , Size, 32> { typedef          short     type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short    , Size, 32> { typedef unsigned short     type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         char     , Size, 32> { typedef          char      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char     , Size, 32> { typedef unsigned char      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<  signed char     , Size, 32> { typedef   signed char      type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<         bool     , Size, 32> { typedef unsigned char      type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#endif
}  // namespace Detail

// Shorthand for Detail::IntrinsicType<ValueType, Size>::type.
template <typename ValueType, size_t Size>
using IntrinsicType = typename Detail::IntrinsicType<ValueType, Size>::type;

// Shorthand for Detail::BuiltinType<ValueType, Size>::type (Bytes left at its default).
template <typename ValueType, size_t Size>
using BuiltinType = typename Detail::BuiltinType<ValueType, Size>::type;

// Empty tag types naming the technique a Storage specialization uses to access single
// entries of its vector object. Each tag selects one Storage specialization below.
namespace AliasStrategy
{
struct Union {};          // copy through a union of the vector and an entry array
struct MayAlias {};       // index through a may-alias pointer to the vector
struct VectorBuiltin {};  // store a compiler builtin vector type and subscript it
struct UnionMembers {};   // per-type m()/ref() specializations defined outside the class
}  // namespace AliasStrategy

// Strategy used by Storage when none is given. The first matching condition wins:
// builtin vector types if enabled, then MSVC, then ICC, then any other compiler defining
// __GNUC__; everything else falls back to Union.
using DefaultStrategy =
#if defined Vc_USE_BUILTIN_VECTOR_TYPES
    AliasStrategy::VectorBuiltin;
#elif defined Vc_MSVC
    AliasStrategy::UnionMembers;
#elif defined Vc_ICC
    AliasStrategy::Union;
#elif defined __GNUC__
    AliasStrategy::MayAlias;
#else
    AliasStrategy::Union;
#endif

// Holds Size entries of ValueType in one vector object. Only the partial specializations
// for the individual AliasStrategy tags are defined; the primary template is not.
template <typename ValueType, size_t Size, typename Strategy = DefaultStrategy>
class Storage;

// Storage specialization for AliasStrategy::Union: single entries are read and written
// by copying the vector into a local union (Alias) of the vector and an entry array.
// GCC 6 forbids `EntryType m[]` altogether
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::Union>
{
    static_assert(std::is_fundamental<ValueType>::value &&
                      std::is_arithmetic<ValueType>::value,
                  "Only works for fundamental arithmetic types.");

public:
    using VectorType = IntrinsicType<ValueType, Size>;
    using EntryType = ValueType;

    // Overlays the vector with an array of Size entries.
    union Alias {
        Vc_INTRINSIC Alias(VectorType vv) : v(vv) {}
        VectorType v;
        EntryType m[Size];
    };

    // Default construction initialises data with Vc::Detail::zero<VectorType>().
    Vc_INTRINSIC Storage() : data(Vc::Detail::zero<VectorType>()) {}
    Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
    // Construct from any object of the same size as VectorType by reinterpreting it.
    template <typename U>
    Vc_INTRINSIC explicit Storage(const U &x,
                                  enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
        : data(reinterpret_cast<VectorType>(x))
    {
        assertCorrectAlignment(&data);
    }

    Vc_INTRINSIC Storage(const Storage &) = default;
    Vc_INTRINSIC Storage &operator=(const Storage &) = default;

    // Access to the whole vector.
    Vc_INTRINSIC operator const VectorType &() const { return data; }
    Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
    Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
    // Returns entry i, read from a temporary Alias copy of data.
    Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; }
    // Replaces entry i: copy data into an Alias, modify the entry, copy the vector back.
    Vc_INTRINSIC void set(size_t i, EntryType x)
    {
        Alias a(data);
        a.m[i] = x;
        data = a.v;
    }

private:
    VectorType data;
};

// Storage specialization for AliasStrategy::MayAlias: the vector is stored directly and
// single entries are accessed in place through a MayAlias<EntryType> pointer to it.
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::MayAlias>
{
    static_assert(std::is_fundamental<ValueType>::value &&
                      std::is_arithmetic<ValueType>::value,
                  "Only works for fundamental arithmetic types.");

public:
    using EntryType = ValueType;
    using VectorType = IntrinsicType<ValueType, Size>;

    // Every constructor checks the alignment of the data member after initialising it.
    Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
    Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
    // Construct from any object of the same size as VectorType by reinterpreting it.
    template <typename U>
    Vc_INTRINSIC explicit Storage(const U &x,
                                  enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
        : data(reinterpret_cast<const VectorType &>(x))
    {
        assertCorrectAlignment(&data);
    }

    Vc_INTRINSIC Storage(const Storage &) = default;
    Vc_INTRINSIC Storage &operator=(const Storage &) = default;
    Vc_INTRINSIC Storage &operator=(const VectorType &x)
    {
        data = x;
        return *this;
    }

    // Access to the whole vector.
    Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
    Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
    Vc_INTRINSIC operator const VectorType &() const { return data; }

    // Returns entry i of the vector.
    Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const
    {
        return *(reinterpret_cast<const MayAlias<EntryType> *>(&data) + i);
    }
    // Overwrites entry i of the vector with x.
    Vc_INTRINSIC void set(size_t i, EntryType x)
    {
        *(reinterpret_cast<MayAlias<EntryType> *>(&data) + i) = x;
    }

private:
    VectorType data;
};

// Storage specialization for AliasStrategy::VectorBuiltin: the data member is a compiler
// builtin vector (BuiltinType) that is subscripted directly for entry access; the
// intrinsic VectorType is exposed by reinterpreting that member.
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::VectorBuiltin>
{
    static_assert(std::is_fundamental<ValueType>::value &&
                      std::is_arithmetic<ValueType>::value,
                  "Only works for fundamental arithmetic types.");

    using Builtin = BuiltinType<ValueType, Size>;

public:
    // With Vc_TEMPLATES_DROP_ATTRIBUTES the intrinsic type is additionally wrapped in
    // MayAlias<>.
    using VectorType =
#ifdef Vc_TEMPLATES_DROP_ATTRIBUTES
        MayAlias<IntrinsicType<ValueType, Size>>;
#else
        IntrinsicType<ValueType, Size>;
#endif
    using EntryType = ValueType;

    // Every constructor checks the alignment of the data member after initialising it.
    Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
    Vc_INTRINSIC Storage(const VectorType &x)
        : data(reinterpret_cast<const MayAlias<Builtin> &>(x))
    {
        assertCorrectAlignment(&data);
    }
    // Construct from any object of the same size as VectorType by reinterpreting it.
    template <typename U>
    Vc_INTRINSIC explicit Storage(const U &x,
                                  enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
        : data(reinterpret_cast<const MayAlias<Builtin> &>(x))
    {
        assertCorrectAlignment(&data);
    }
    Vc_INTRINSIC Storage &operator=(const VectorType &x)
    {
        data = reinterpret_cast<const MayAlias<Builtin> &>(x);
        return *this;
    }

    Vc_INTRINSIC Storage(const Storage &) = default;
    Vc_INTRINSIC Storage &operator=(const Storage &) = default;

    // Access to the whole vector as the intrinsic type (a reinterpreted view of data).
    Vc_INTRINSIC operator const VectorType &() const { return v(); }
    Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast<VectorType &>(data); }
    Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast<const VectorType &>(data); }

    // Entry access subscripts the builtin vector directly.
    Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; }
    Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; }

    // Access to the whole vector as the builtin vector type.
    Vc_INTRINSIC Builtin &builtin() { return data; }
    Vc_INTRINSIC const Builtin &builtin() const { return data; }

private:
    Builtin data;
};

// Storage specialization for AliasStrategy::UnionMembers: the vector is stored directly;
// m() and ref() are only declared here and are defined by explicit specializations
// outside the class.
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::UnionMembers>
{
    static_assert(std::is_fundamental<ValueType>::value &&
                      std::is_arithmetic<ValueType>::value,
                  "Only works for fundamental arithmetic types.");

public:
    using VectorType = IntrinsicType<ValueType, Size>;
    using EntryType = ValueType;

    // Every constructor checks the alignment of the data member after initialising it.
    Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
    Vc_INTRINSIC Storage(const VectorType &x) : data(x)
    {
        assertCorrectAlignment(&data);
    }
    // Construct from any object of the same size as VectorType by reinterpreting it.
    template <typename U>
    Vc_INTRINSIC explicit Storage(const U &x,
                                  enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
        : data(reinterpret_cast<const VectorType &>(x))
    {
        assertCorrectAlignment(&data);
    }
    Vc_INTRINSIC Storage &operator=(const VectorType &x)
    {
        data = x;
        return *this;
    }

    Vc_INTRINSIC Storage(const Storage &) = default;
    Vc_INTRINSIC Storage &operator=(const Storage &) = default;

    // Access to the whole vector. The implicit conversion mirrors the one offered by the
    // other Storage specializations, so code relying on it does not depend on which
    // strategy is the default.
    Vc_INTRINSIC operator const VectorType &() const { return v(); }
    Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
    Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }

    // Returns entry i; defined per (ValueType, Size) by explicit specializations.
    Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R;
    // Overwrites entry i with x through ref().
    Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; }

private:
    // Returns a writable reference to entry i; defined by explicit specializations.
    Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R;
    VectorType data;
};

// MSVC only: definitions of Storage<..., UnionMembers>::m() and ::ref() as explicit
// specializations, one per (entry type, entry count) pair. Each one indexes the named
// array member of the intrinsic type that matches the entry type (m128d_f64, m128_f32,
// m128i_i32, ...). The index i is not range checked.
#ifdef Vc_MSVC
// 128-bit vectors, read access.
template <> Vc_INTRINSIC Vc_PURE          double Storage<         double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE          float  Storage<         float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed int    Storage<  signed int   , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed short  Storage<  signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed char   Storage<  signed char  ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int    Storage<unsigned int   , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short  Storage<unsigned short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char   Storage<unsigned char  ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u8[i]; }

// 128-bit vectors, mutable reference access. The signed char variant needs a
// reinterpret_cast (presumably because the element type of m128i_i8 is not spelled
// `signed char` - TODO confirm against the MSVC headers).
template <> Vc_INTRINSIC Vc_PURE          double &Storage<         double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE          float  &Storage<         float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed int    &Storage<  signed int   , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed short  &Storage<  signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed char   &Storage<  signed char  ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m128i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int    &Storage<unsigned int   , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short  &Storage<unsigned short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char   &Storage<unsigned char  ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u8[i]; }

// The same two tables for 256-bit vectors; only compiled when Vc_IMPL_AVX is defined.
#ifdef Vc_IMPL_AVX
template <> Vc_INTRINSIC Vc_PURE          double Storage<         double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE          float  Storage<         float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed int    Storage<  signed int   , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed short  Storage<  signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed char   Storage<  signed char  ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int    Storage<unsigned int   , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short  Storage<unsigned short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char   Storage<unsigned char  ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u8[i]; }

template <> Vc_INTRINSIC Vc_PURE          double &Storage<         double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE          float  &Storage<         float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed int    &Storage<  signed int   , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed short  &Storage<  signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE   signed char   &Storage<  signed char  ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m256i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int    &Storage<unsigned int   , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short  &Storage<unsigned short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char   &Storage<unsigned char  ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u8[i]; }
#endif
#endif  // Vc_MSVC

// Convenience alias: selects the Storage type for a given vector type and entry type.
// The entry count is derived as sizeof(VectorType) / sizeof(EntryType); the third
// template argument of Storage is left at its default.
template <typename VectorType, typename EntryType>
using VectorMemoryUnion = Storage<EntryType, sizeof(VectorType) / sizeof(EntryType)>;

}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_STORAGE_H_
/*  This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_CONST_DATA_H_
#define VC_SSE_CONST_DATA_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/


#ifndef VC_SSE_MACROS_H_
#define VC_SSE_MACROS_H_

// Spellings for the three 128-bit SSE register types. Each macro is only defined if
// nothing defined it earlier, so a definition supplied beforehand takes precedence.
#ifndef _M128
# define _M128 __m128
#endif

#ifndef _M128I
# define _M128I __m128i
#endif

#ifndef _M128D
# define _M128D __m128d
#endif

// Vc_USE_PTEST is defined when the SSE4.1 implementation is selected, unless the user
// opted out by defining Vc_DISABLE_PTEST. The code that tests Vc_USE_PTEST lives outside
// of this section.
#if defined(Vc_IMPL_SSE4_1) && !defined(Vc_DISABLE_PTEST)
#define Vc_USE_PTEST
#endif

#endif // VC_SSE_MACROS_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{

// 16-byte aligned constant tables, one per entry width (4 x 32 bit, 8 x 16 bit,
// 16 x 8 bit). Only declared here; the values are defined elsewhere.
// NOTE(review): judging by the names the contents are 0, 1, 2, ... - confirm at the
// definition.
alignas(16) extern const unsigned int   _IndexesFromZero4[4];
alignas(16) extern const unsigned short _IndexesFromZero8[8];
alignas(16) extern const unsigned char  _IndexesFromZero16[16];

// Collection of aligned constant tables, each exactly one 128-bit vector wide. The
// members are only declared here; their values are defined elsewhere. Several of them are
// loaded by the helper functions in namespace SseIntrinsics (noted per member).
struct c_general
{
    // NOTE(review): the first member is 64-byte aligned while all others are 16-byte
    // aligned - presumably to start the group on a cache-line boundary; confirm.
    alignas(64) static const int absMaskFloat[4];             // loaded by _mm_setabsmask_ps()
    alignas(16) static const unsigned int signMaskFloat[4];   // loaded by _mm_setsignmask_ps() and setmin_epi32()
    alignas(16) static const unsigned int highMaskFloat[4];
    alignas(16) static const short minShort[8];               // loaded by setmin_epi16()

    alignas(16) static const unsigned short one16[8];         // loaded by _mm_setone_epi16()
    alignas(16) static const unsigned int one32[4];           // loaded by _mm_setone_epi32()
    alignas(16) static const float oneFloat[4];               // loaded by _mm_setone_ps()

    alignas(16) static const unsigned long long highMaskDouble[2];
    alignas(16) static const double oneDouble[2];             // loaded by _mm_setone_pd()
    alignas(16) static const long long absMaskDouble[2];      // loaded by _mm_setabsmask_pd()
    alignas(16) static const unsigned long long signMaskDouble[2];  // loaded by _mm_setsignmask_pd() and setmin_epi64()
    alignas(16) static const unsigned long long frexpMask[2];
};

// Holder of a 64-byte aligned constant table of type T. The array bound is not specified
// here; contents and size are fixed by the definition elsewhere.
// NOTE(review): the name suggests coefficients for trigonometric functions - confirm at
// the definition.
template<typename T> struct c_trig
{
    alignas(64) static const T data[];
};
// Declares the float and double explicit specializations of the table. The declarations
// are skipped when compiling with MSVC.
#ifndef Vc_MSVC
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif

// Holder of a 64-byte aligned table consisting of 21 rows of Size 32-bit words each,
// where Size is the number of T entries that fit into 16 bytes.
template<typename T> struct c_log
{
    enum VectorSize { Size = 16 / sizeof(T) };

    // Raw table storage (declared as unsigned int, read as float through d()).
    alignas(64) static const unsigned int data[21 * Size];

    // Returns the address of row i, typed as pointer to float. i is not range checked.
    static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i)
    {
        const unsigned int *const row = data + i * Size;
        return reinterpret_cast<const float *>(row);
    }
};
// Declares the explicit specialization of the table for float. The declaration is
// skipped when compiling with MSVC.
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21 * 4];
#endif

// Specialization for double: 21 rows of Size (= 16 / sizeof(double)) 64-bit words each,
// stored as unsigned long long and handed out as pointer to double.
template<> struct c_log<double>
{
    enum VectorSize { Size = 16 / sizeof(double) };

    // Raw table storage.
    alignas(64) static const unsigned long long data[21 * Size];

    // Returns the address of row i, typed as pointer to double. i is not range checked.
    static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i)
    {
        const unsigned long long *const row = data + i * Size;
        return reinterpret_cast<const double *>(row);
    }
};

}  // namespace SSE
}  // namespace Vc

#endif // VC_SSE_CONST_DATA_H_
#include <cstdlib>
/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_DEBUG_H_
#define VC_SSE_DEBUG_H_

#ifndef NDEBUG
#include <iostream>
#include <iomanip>
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{

#ifdef NDEBUG
// Variant used when NDEBUG is defined: accepts the same constructor arguments and
// operator<< calls as the variant below but discards everything.
class DebugStream
{
public:
    DebugStream(const char *, const char *, int) {}

    template <typename T> inline DebugStream &operator<<(const T &) { return *this; }
};
#else
// Variant used when NDEBUG is not defined: writes one highlighted line to std::cerr. The
// constructor prints the prefix (file, line, function), every operator<< appends to the
// line, and the destructor resets the terminal attributes and ends the line.
class DebugStream
{
public:
    DebugStream(const char *func, const char *file, int line)
    {
        std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' ';
    }

    ~DebugStream() { std::cerr << "\033[0m" << std::endl; }

    // Anything that std::cerr can print on its own is forwarded unchanged.
    template <typename T> DebugStream &operator<<(const T &x)
    {
        std::cerr << x;
        return *this;
    }

    // SSE registers are printed as a hex dump followed by their entries, interpreted as
    // float, double and int respectively.
    DebugStream &operator<<(__m128 x)
    {
        printVector<float, __m128>(x);
        return *this;
    }
    DebugStream &operator<<(__m128d x)
    {
        printVector<double, __m128d>(x);
        return *this;
    }
    DebugStream &operator<<(__m128i x)
    {
        printVector<int, __m128i>(x);
        return *this;
    }

private:
    // Maps a nibble value (0..15) to its lower-case hexadecimal digit.
    static char hexChar(char x)
    {
        return static_cast<char>(x > 9 ? x - 10 + 'a' : x + '0');
    }

    // Prints `value` twice: first as "0x" plus the bytes in memory order (two hex digits
    // per byte, a ' after every fourth byte), then as " = [e0, e1, ...]" with the bytes
    // interpreted as an array of T.
    template <typename T, typename V> static void printVector(V value)
    {
        std::cerr << "0x";
        const std::uint8_t *const raw = reinterpret_cast<const std::uint8_t *>(&value);
        for (std::size_t pos = 0; pos != sizeof(V); ++pos) {
            const std::uint8_t byte = raw[pos];
            std::cerr << hexChar(byte >> 4) << hexChar(byte & 0xf);
            if ((pos & 3) == 3) {
                std::cerr << '\'';
            }
        }

        enum { Size = sizeof(V) / sizeof(T) };
        union { V v; T m[Size]; } x = { value };
        std::cerr << " = [";
        for (int i = 0; i < Size; ++i) {
            if (i != 0) {
                std::cerr << ", ";
            }
            std::cerr << std::setprecision(24) << x.m[i];
        }
        std::cerr << ']';
    }
};
#endif

// Creates a temporary DebugStream tagged with the enclosing function, file and line.
// Intended use: `Vc_DEBUG << value << ...;` - the line is finished when the temporary is
// destroyed at the end of the full expression.
// NOTE(review): __PRETTY_FUNCTION__ is a GCC/Clang extension.
#define Vc_DEBUG Vc::SSE::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__)

}  // namespace SSE
}  // namespace Vc

#endif // VC_SSE_DEBUG_H_

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
// GCC uses lots of old-style-casts in macros that disguise as intrinsics
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    // Make the constant tables of SSE::c_general available unqualified in this namespace.
    using SSE::c_general;

    // Alignment in bytes associated with the SSE vector types (16 bytes = 128 bits).
    constexpr std::size_t VectorAlignment = 16;

#if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

#ifdef Vc_GCC
    // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
    // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
    //
    // Inside namespace SseIntrinsics these definitions are found in place of the
    // same-named intrinsics. Each one casts both operands to the element-typed GCC
    // vector type (__v2df / __v4sf), applies the plain arithmetic operator and casts the
    // result back to the intrinsic type.
    static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif

    // Loaders for the Common::AllBitsSet table. All three read the same 16 bytes with an
    // aligned load; they only differ in the vector type of the result.
    static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128()
    {
        const __m128i *const bits = reinterpret_cast<const __m128i *>(Common::AllBitsSet);
        return _mm_load_si128(bits);
    }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd()
    {
        const double *const bits = reinterpret_cast<const double *>(Common::AllBitsSet);
        return _mm_load_pd(bits);
    }
    static Vc_INTRINSIC Vc_CONST __m128 _mm_setallone_ps()
    {
        const float *const bits = reinterpret_cast<const float *>(Common::AllBitsSet);
        return _mm_load_ps(bits);
    }

    // Vectors with the value one in every entry.
    //
    // 8-bit entries: broadcast of the constant 1. The unsigned spelling forwards to the
    // signed one.
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8()
    {
        return _mm_set1_epi8(1);
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8()
    {
        return _mm_setone_epi8();
    }

    // 16-bit entries: aligned load of c_general::one16.
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16()
    {
        const __m128i *const ones = reinterpret_cast<const __m128i *>(c_general::one16);
        return _mm_load_si128(ones);
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16()
    {
        return _mm_setone_epi16();
    }

    // 32-bit entries: aligned load of c_general::one32.
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32()
    {
        const __m128i *const ones = reinterpret_cast<const __m128i *>(c_general::one32);
        return _mm_load_si128(ones);
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32()
    {
        return _mm_setone_epi32();
    }

    // Floating-point entries: aligned loads of c_general::oneFloat / oneDouble.
    static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps()
    {
        const float *const ones = c_general::oneFloat;
        return _mm_load_ps(ones);
    }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd()
    {
        const double *const ones = c_general::oneDouble;
        return _mm_load_pd(ones);
    }

    // Loaders for the absolute-value and sign masks. The tables in c_general are stored
    // with integer element types; they are read here as floating-point vectors without
    // any value conversion.
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd()
    {
        const double *const mask = reinterpret_cast<const double *>(c_general::absMaskDouble);
        return _mm_load_pd(mask);
    }
    static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps()
    {
        const float *const mask = reinterpret_cast<const float *>(c_general::absMaskFloat);
        return _mm_load_ps(mask);
    }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd()
    {
        const double *const mask = reinterpret_cast<const double *>(c_general::signMaskDouble);
        return _mm_load_pd(mask);
    }
    static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps()
    {
        const float *const mask = reinterpret_cast<const float *>(c_general::signMaskFloat);
        return _mm_load_ps(mask);
    }

    // Vectors used as the per-entry "minimum signed value" for 8/16/32/64-bit entries.
    // The 8-bit variant broadcasts -0x80; the 32-bit and 64-bit variants reuse the
    // floating-point sign-mask tables, read as integers.
    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8()
    {
        return _mm_set1_epi8(-0x80);
    }
    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16()
    {
        const __m128i *const src = reinterpret_cast<const __m128i *>(c_general::minShort);
        return _mm_load_si128(src);
    }
    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32()
    {
        const __m128i *const src = reinterpret_cast<const __m128i *>(c_general::signMaskFloat);
        return _mm_load_si128(src);
    }
    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi64()
    {
        const __m128i *const src = reinterpret_cast<const __m128i *>(c_general::signMaskDouble);
        return _mm_load_si128(src);
    }

#if defined(Vc_IMPL_XOP)
    // Unsigned less-than / greater-than compares for 8, 16, 32 and 64-bit entries. In
    // the XOP configuration each one forwards to the matching _mm_com{lt,gt}_epu*
    // intrinsic.
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu8(__m128i a, __m128i b)
    {
        return _mm_comlt_epu8(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
    {
        return _mm_comgt_epu8(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
    {
        return _mm_comlt_epu16(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
    {
        return _mm_comgt_epu16(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
    {
        return _mm_comlt_epu32(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
    {
        return _mm_comgt_epu32(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b)
    {
        return _mm_comlt_epu64(a, b);
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu64(__m128i a, __m128i b)
    {
        return _mm_comgt_epu64(a, b);
    }
#else
    // Unsigned compares built from the signed compare instructions: both operands are
    // XORed with the setmin_* vector of the matching entry width before the signed
    // compare is applied. Flipping the top bit of every entry maps the unsigned ordering
    // onto the signed one.
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu8(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi8();
        return _mm_cmplt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi8();
        return _mm_cmpgt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi16();
        return _mm_cmplt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi16();
        return _mm_cmpgt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi32();
        return _mm_cmplt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi32();
        return _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
    // Signed greater-than compare of the two 64-bit entries of a and b. Each 64-bit
    // entry of the result is all ones where a > b and zero otherwise.
    Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
    {
#ifdef Vc_IMPL_SSE4_2
        return _mm_cmpgt_epi64(a, b);
#else
        // Emulation with 32-bit compares. A 64-bit entry is greater if
        //   - its high half is greater (signed compare), or
        //   - the high halves are equal and its low half is greater (unsigned compare).
        // The low halves are made comparable as unsigned values by flipping their top
        // bit before the signed 32-bit compare; the high halves are left untouched.
        const __m128i lowHalfSignBit = _mm_srli_epi64(setmin_epi32(), 32);
        const __m128i lhs = _mm_xor_si128(a, lowHalfSignBit);
        const __m128i rhs = _mm_xor_si128(b, lowHalfSignBit);

        const __m128i greater = _mm_cmpgt_epi32(lhs, rhs);
        const __m128i equal = _mm_cmpeq_epi32(lhs, rhs);

        // Result of the high-half compare, copied into both halves of each entry.
        const __m128i highGreater = _mm_shuffle_epi32(greater, 0xf5);

        // (high halves equal) AND (low half greater), computed in the low half of each
        // entry and then copied into both halves.
        const __m128i lowDecides = _mm_and_si128(_mm_srli_epi64(equal, 32), greater);
        const __m128i lowGreater = _mm_shuffle_epi32(lowDecides, 0xa0);

        return _mm_or_si128(highGreater, lowGreater);
#endif
    }
    // Unsigned greater-than compare of the two 64-bit entries: both operands are XORed
    // with setmin_epi64() and then handed to the signed cmpgt_epi64.
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu64(__m128i a, __m128i b)
    {
        const __m128i bias = setmin_epi64();
        const __m128i lhs = _mm_xor_si128(a, bias);
        const __m128i rhs = _mm_xor_si128(b, bias);
        return cmpgt_epi64(lhs, rhs);
    }
#endif
}  // namespace SseIntrinsics
}  // namespace Vc

// SSSE3
#ifdef Vc_IMPL_SSSE3
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    // not overriding _mm_set1_epi8 because this one should only be used for non-constants
    // SSSE3 configuration: the helpers forward to the hardware instructions.

    // Per-entry absolute value for 8, 16 and 32-bit signed entries.
    Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a)
    {
        return _mm_abs_epi8(a);
    }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a)
    {
        return _mm_abs_epi16(a);
    }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a)
    {
        return _mm_abs_epi32(a);
    }

    // Byte-wise align of a and b with the shift count given as template argument; only
    // the low five bits of s are used.
    template <int s>
    Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
    {
        return _mm_alignr_epi8(a, b, s & 0x1fu);
    }
}  // namespace SseIntrinsics
}  // namespace Vc

#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    // Per-entry absolute value without SSSE3, using two's complement negation
    // (-a == ~a + 1) applied only to the negative entries:
    //
    //   isNegative = (a < 0)       all ones for negative entries, zero otherwise
    //   a ^ isNegative             ~a for negative entries, a otherwise
    //   + 1 or + 0                 the increment is derived from isNegative
    //
    // For a non-negative entry: isNegative == 0, so the result is a + 0 == a.
    // For a negative entry:     isNegative == -1, so the result is (-a - 1) + 1 == -a.
    //
    // The 16 and 32-bit versions obtain the increment by logically shifting isNegative
    // right by (bits - 1), which turns -1 into 1. The 8-bit version masks a vector of
    // ones with isNegative instead.
    Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a)
    {
        const __m128i isNegative = _mm_cmplt_epi8(a, _mm_setzero_si128());
        const __m128i complemented = _mm_xor_si128(a, isNegative);
        const __m128i increment = _mm_and_si128(isNegative, _mm_setone_epi8());
        return _mm_add_epi8(complemented, increment);
    }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a)
    {
        const __m128i isNegative = _mm_cmplt_epi16(a, _mm_setzero_si128());
        const __m128i complemented = _mm_xor_si128(a, isNegative);
        const __m128i increment = _mm_srli_epi16(isNegative, 15);
        return _mm_add_epi16(complemented, increment);
    }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a)
    {
        const __m128i isNegative = _mm_cmplt_epi32(a, _mm_setzero_si128());
        const __m128i complemented = _mm_xor_si128(a, isNegative);
        const __m128i increment = _mm_srli_epi32(isNegative, 31);
        return _mm_add_epi32(complemented, increment);
    }
    // Emulation of the byte-wise align operation without SSSE3. The 32 bytes formed by
    // a (upper half) and b (lower half) are shifted right by n = (s & 0x1f) bytes and the
    // lower 16 bytes are returned:
    //   n == 0      -> b
    //   0 < n < 16  -> top (16 - n) bytes of b in the low positions, low n bytes of a above
    //   n == 16     -> a
    //   n > 16      -> a shifted right by (n - 16) bytes, zero filled
    // Every case is spelled out so that each byte shift receives a literal shift count.
    template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
    {
        switch (s & 0x1fu) {
            case  0: return b;
            case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
            case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
            case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
            case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
            case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
            case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
            case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
            case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
            case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
            case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
            case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
            case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
            case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
            case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
            case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
            case 16: return a;
            case 17: return _mm_srli_si128(a,  1);
            case 18: return _mm_srli_si128(a,  2);
            case 19: return _mm_srli_si128(a,  3);
            case 20: return _mm_srli_si128(a,  4);
            case 21: return _mm_srli_si128(a,  5);
            case 22: return _mm_srli_si128(a,  6);
            case 23: return _mm_srli_si128(a,  7);
            case 24: return _mm_srli_si128(a,  8);
            case 25: return _mm_srli_si128(a,  9);
            case 26: return _mm_srli_si128(a, 10);
            case 27: return _mm_srli_si128(a, 11);
            case 28: return _mm_srli_si128(a, 12);
            case 29: return _mm_srli_si128(a, 13);
            case 30: return _mm_srli_si128(a, 14);
            case 31: return _mm_srli_si128(a, 15);
        }
        // Not reachable: the switch covers all 32 values of (s & 0x1f). The return
        // statement keeps every control path returning a value.
        return _mm_setzero_si128();
    }
}  // namespace SseIntrinsics
}  // namespace Vc
#endif

// SSE4.1
#ifdef Vc_IMPL_SSE4_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
// SSE4.1 configuration: thin forwarding wrappers around the hardware instructions.

// Equality compare of the two 64-bit entries.
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); }

// Returns the 32-bit entry of v selected by the template argument.
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v) { return _mm_extract_epi32(v, index); }
// Blends selected by a run-time mask vector c (third argument of the instruction).
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) { return _mm_blendv_pd(a, b, c); }
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) { return _mm_blendv_ps(a, b, c); }
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) { return _mm_blendv_epi8(a, b, c); }

// Blends selected by a compile-time mask, passed on as the immediate operand.
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b) { return _mm_blend_pd(a, b, mask); }
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b) { return _mm_blend_ps(a, b, mask); }
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b) { return _mm_blend_epi16(a, b, mask); }
// Per-entry maximum for the entry types that gained an instruction with SSE4.1.
Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b) { return _mm_max_epi8(a, b); }
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) { return _mm_max_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) { return _mm_max_epu16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) { return _mm_max_epu32(a, b); }

// Per-entry minimum, same set of entry types.
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) { return _mm_min_epu16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) { return _mm_min_epu32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b) { return _mm_min_epi8(a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
// Widening integer conversions (SSE4.1). Each function converts the lowest entries of
// its argument to the wider entry type named in the function: the `epu` variants
// zero-extend (source entries are unsigned), the `epi` variants sign-extend (source
// entries are signed). The parameter name states how the source entries are interpreted.

// 8-bit -> 16-bit
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
{
    return _mm_cvtepu8_epi16(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
{
    return _mm_cvtepi8_epi16(epi8);
}

// 16-bit -> 32-bit
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
{
    return _mm_cvtepu16_epi32(epu16);
}
// The argument holds *signed* 16-bit entries (it used to be misnamed `epu16`).
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epi16)
{
    return _mm_cvtepi16_epi32(epi16);
}

// 8-bit -> 32-bit
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
{
    return _mm_cvtepu8_epi32(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
{
    return _mm_cvtepi8_epi32(epi8);
}
// Loads 16 bytes from mem with the SSE4.1 streaming-load instruction.
Vc_INTRINSIC Vc_PURE __m128i stream_load_si128(__m128i *mem) { return _mm_stream_load_si128(mem); }
}  // namespace SseIntrinsics
}  // namespace Vc
#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    // Equality compare of the two 64-bit entries without SSE4.1: compare as four 32-bit
    // entries, then AND every 32-bit result with the result of the other half of the same
    // 64-bit entry. An entry is only reported equal if both of its halves are equal.
    Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
    {
        const __m128i halvesEqual = _mm_cmpeq_epi32(a, b);
        // 0xb1 selects the source entries (1, 0, 3, 2), i.e. it swaps the two 32-bit
        // halves inside each 64-bit entry.
        const __m128i halvesSwapped = _mm_shuffle_epi32(halvesEqual, 0xb1);
        return _mm_and_si128(halvesEqual, halvesSwapped);
    }
    // Returns the 32-bit entry of v selected by the template argument, without SSE4.1.
    template <int index>
    Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
    {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
        // View the register as a vector of four ints and subscript it directly.
        typedef int int32v4 __attribute__((__vector_size__(16)));
        return reinterpret_cast<const MayAlias<int32v4> &>(v)[index];
#else
        // Move the requested entry down to position 0 (4 bytes per entry), then read
        // the lowest 32 bits.
        const __m128i shifted = _mm_srli_si128(v, index * 4);
        return _mm_cvtsi128_si32(shifted);
#endif
    }
    // Bitwise select: every result bit is taken from b where the bit in c is set and from
    // a where it is clear.
    Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
        const __m128d from_a = _mm_andnot_pd(c, a);  // ~c & a
        const __m128d from_b = _mm_and_pd(c, b);     //  c & b
        return _mm_or_pd(from_a, from_b);
    }
    // Bitwise select: every result bit is taken from b where the bit in c is set and from
    // a where it is clear.
    Vc_INTRINSIC Vc_CONST __m128  blendv_ps(__m128  a, __m128  b, __m128  c) {
        const __m128 from_a = _mm_andnot_ps(c, a);  // ~c & a
        const __m128 from_b = _mm_and_ps(c, b);     //  c & b
        return _mm_or_ps(from_a, from_b);
    }
    // Bitwise select: every result bit is taken from b where the bit in c is set and from
    // a where it is clear.
    Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
        const __m128i from_a = _mm_andnot_si128(c, a);  // ~c & a
        const __m128i from_b = _mm_and_si128(c, b);     //  c & b
        return _mm_or_si128(from_a, from_b);
    }

    // only use the following blend functions with immediates as mask and, of course, compiling
    // with optimization
    // Two-element blend with a compile-time mask: bit i of mask set selects element i from
    // b, a cleared bit selects it from a. Masks outside 0..3 call abort().
    template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
    {
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            // low element from b (first operand, index 0), high element from a (index 1)
            return _mm_shuffle_pd(b, a, 2);
        case 0x2:
            // low element from a (index 0), high element from b (index 1)
            return _mm_shuffle_pd(a, b, 2);
        case 0x3:
            return b;
        default:
            abort();
            return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value'
        }
    }
    // Four-element blend with a compile-time mask: bit i of mask set selects element i from
    // b, a cleared bit selects it from a.
    //
    // Apart from the trivial masks 0x0 and 0xf, each case builds a select mask c with all
    // bits set in the 32-bit elements that are taken from b (either by byte-shifting an
    // all-ones register or via _mm_set_epi32, whose arguments are listed from element 3
    // down to element 0). The result is then (~c & a) | (c & b). Masks outside 0..0xf call
    // abort().
    template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
    {
        __m128i c;
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x2:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
            break;
        case 0x3:
            c = _mm_srli_si128(_mm_setallone_si128(), 8);
            break;
        case 0x4:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
            break;
        case 0x5:
            c = _mm_set_epi32(0, -1, 0, -1);
            break;
        case 0x6:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
            break;
        case 0x7:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x8:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x9:
            c = _mm_set_epi32(-1, 0, 0, -1);
            break;
        case 0xa:
            c = _mm_set_epi32(-1, 0, -1, 0);
            break;
        case 0xb:
            c = _mm_set_epi32(-1, 0, -1, -1);
            break;
        case 0xc:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xd:
            c = _mm_set_epi32(-1, -1, 0, -1);
            break;
        case 0xe:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xf:
            return b;
        default: // may not happen
            abort();
            c = _mm_setzero_si128();
            break;
        }
        __m128 _c = _mm_castsi128_ps(c);
        return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
    }
    // Eight-element (16-bit) blend with a compile-time mask: bit i of mask set selects
    // element i from b, a cleared bit selects it from a.
    //
    // Contiguous runs of low or high elements get a select mask c from byte-shifting an
    // all-ones register; 0x0f, 0xcc and 0x33 are handled with unpack/shuffle sequences.
    // All remaining masks use the generic default: every 16-bit lane i multiplies the mask
    // by 2^(15-i), which moves mask bit i into the lane's sign bit, and the arithmetic
    // right shift by 15 then broadcasts that bit over the whole lane. (Lane 0 uses the
    // factor -0x7fff, i.e. 0x8001; the extra +1 only affects bits below the sign bit.)
    // The result is (~c & a) | (c & b).
    template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
    {
        __m128i c;
        switch (mask) {
        case 0x00:
            return a;
        case 0x01:
            c = _mm_srli_si128(_mm_setallone_si128(), 14);
            break;
        case 0x03:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x07:
            c = _mm_srli_si128(_mm_setallone_si128(), 10);
            break;
        case 0x0f:
            return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
        case 0x1f:
            c = _mm_srli_si128(_mm_setallone_si128(), 6);
            break;
        case 0x3f:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x7f:
            c = _mm_srli_si128(_mm_setallone_si128(), 2);
            break;
        case 0x80:
            c = _mm_slli_si128(_mm_setallone_si128(), 14);
            break;
        case 0xc0:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0xe0:
            c = _mm_slli_si128(_mm_setallone_si128(), 10);
            break;
        case 0xf0:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xf8:
            c = _mm_slli_si128(_mm_setallone_si128(), 6);
            break;
        case 0xfc:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xfe:
            c = _mm_slli_si128(_mm_setallone_si128(), 2);
            break;
        case 0xff:
            return b;
        case 0xcc:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
        case 0x33:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
        default:
            const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
            c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
            break;
        }
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }

    // Signed 8-bit per-lane maximum, emulated with a compare and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
        const __m128i a_greater = _mm_cmpgt_epi8(a, b);  // all-ones where a > b
        return blendv_epi8(b, a, a_greater);             // a where a > b, b elsewhere
    }
    // Signed 32-bit per-lane maximum, emulated with a compare and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
        const __m128i a_greater = _mm_cmpgt_epi32(a, b);  // all-ones where a > b
        return blendv_epi8(b, a, a_greater);              // a where a > b, b elsewhere
    }
//X         Vc_INTRINSIC Vc_CONST __m128i max_epu8 (__m128i a, __m128i b) {
//X             return _mm_blendv_epi8(b, a, cmpgt_epu8 (a, b));
//X         }
    // 16-bit per-lane maximum built from cmpgt_epu16 (defined elsewhere in this file;
    // presumably the unsigned a > b compare) and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
        const __m128i a_greater = cmpgt_epu16(a, b);
        return blendv_epi8(b, a, a_greater);  // a where the compare is true, b elsewhere
    }
    // 32-bit per-lane maximum built from cmpgt_epu32 (defined elsewhere in this file;
    // presumably the unsigned a > b compare) and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
        const __m128i a_greater = cmpgt_epu32(a, b);
        return blendv_epi8(b, a, a_greater);  // a where the compare is true, b elsewhere
    }
//X         Vc_INTRINSIC Vc_CONST __m128i _mm_min_epu8 (__m128i a, __m128i b) {
//X             return _mm_blendv_epi8(a, b, cmpgt_epu8 (a, b));
//X         }
    // 16-bit per-lane minimum built from cmpgt_epu16 (defined elsewhere in this file;
    // presumably the unsigned a > b compare) and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
        const __m128i a_greater = cmpgt_epu16(a, b);
        return blendv_epi8(a, b, a_greater);  // b where the compare is true, a elsewhere
    }
    // 32-bit per-lane minimum built from cmpgt_epu32 (defined elsewhere in this file;
    // presumably the unsigned a > b compare) and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
        const __m128i a_greater = cmpgt_epu32(a, b);
        return blendv_epi8(a, b, a_greater);  // b where the compare is true, a elsewhere
    }
    // Signed 8-bit per-lane minimum, emulated with a compare and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
        const __m128i a_greater = _mm_cmpgt_epi8(a, b);  // all-ones where a > b
        return blendv_epi8(a, b, a_greater);             // b where a > b, a elsewhere
    }
    // Signed 32-bit per-lane minimum, emulated with a compare and a bitwise select.
    Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
        const __m128i a_greater = _mm_cmpgt_epi32(a, b);  // all-ones where a > b
        return blendv_epi8(a, b, a_greater);              // b where a > b, a elsewhere
    }
    // Widens the low eight unsigned bytes to 16 bits by interleaving them with zero bytes.
    Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
        const __m128i zero = _mm_setzero_si128();
        return _mm_unpacklo_epi8(epu8, zero);
    }
    // Widens the low eight signed bytes to 16 bits by interleaving each byte with its
    // sign-extension byte.
    Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
        const __m128i zero = _mm_setzero_si128();
        const __m128i sign = _mm_cmplt_epi8(epi8, zero);  // 0xff for negative bytes, else 0x00
        return _mm_unpacklo_epi8(epi8, sign);
    }
    // Widens the low four unsigned 16-bit elements to 32 bits by interleaving them with
    // zero words.
    Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
        const __m128i zero = _mm_setzero_si128();
        return _mm_unpacklo_epi16(epu16, zero);
    }
    // Widens the low four signed 16-bit elements to 32 bits by interleaving each element
    // with its sign-extension word. (Despite its name, epu16 is treated as signed.)
    Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) {
        const __m128i zero = _mm_setzero_si128();
        const __m128i sign = _mm_cmplt_epi16(epu16, zero);  // 0xffff for negative elements
        return _mm_unpacklo_epi16(epu16, sign);
    }
    // Widens the low four unsigned bytes to 32 bits in two zero-extension steps
    // (8 -> 16 -> 32 bits).
    Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
        const __m128i as_epu16 = cvtepu8_epi16(epu8);
        return cvtepu16_epi32(as_epu16);
    }
    // Widens the low four signed bytes to 32 bits: each byte is first paired with its sign
    // byte (8 -> 16 bits) and then with a 16-bit word made of two sign bytes (16 -> 32).
    Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
        const __m128i sign8 = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
        const __m128i low16 = _mm_unpacklo_epi8(epi8, sign8);
        const __m128i sign16 = _mm_unpacklo_epi8(sign8, sign8);
        return _mm_unpacklo_epi16(low16, sign16);
    }
    // Replacement for the streaming load in this branch: performs an ordinary aligned
    // 16-byte load (_mm_load_si128) from mem.
    Vc_INTRINSIC Vc_PURE __m128i stream_load_si128(__m128i *mem) {
        return _mm_load_si128(mem);
    }
}  // namespace SseIntrinsics
}  // namespace Vc
#endif

// SSE4.2
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    // Returns element i of v. The name suggests that i is meant to be a compile-time
    // constant (the SSE4.1 code path needs an immediate operand).
    //
    // Without Vc_IMPL_SSE4_1 (or with MSVC) only i = 0..3 are handled by the switch; for
    // any other i the local f is returned without having been assigned, so callers
    // presumably guarantee i < 4 - TODO confirm.
    static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) {
        float f;
        switch (i) {
        case 0:
            f = _mm_cvtss_f32(v);
            break;
#if defined Vc_IMPL_SSE4_1 && !defined Vc_MSVC
        default:
#ifdef Vc_GCC
            f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i));
#else
            // MSVC fails to compile this because it can't optimize i to an immediate
            _MM_EXTRACT_FLOAT(f, v, i);
#endif
            break;
#else
        case 1:
            // shift element 1 down to position 0
            f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4)));
            break;
        case 2:
            // move the high half (elements 2 and 3) into the low half
            f = _mm_cvtss_f32(_mm_movehl_ps(v, v));
            break;
        case 3:
            // shift element 3 down to position 0
            f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12)));
            break;
#endif
        }
        return f;
    }
    // Returns element i of v: the low double for i == 0, the high double for any other i.
    static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) {
        if (i != 0) {
            // move the high 64 bits into the low half, then read the low double
            const __m128 bits = _mm_castpd_ps(v);
            const __m128d high_in_low = _mm_castps_pd(_mm_movehl_ps(bits, bits));
            return _mm_cvtsd_f64(high_in_low);
        }
        return _mm_cvtsd_f64(v);
    }
    // Returns element i of v, where i may be a runtime value.
    //
    // With GCC, an i that the compiler knows to be constant is forwarded to
    // extract_float_imm; otherwise the register is read through a may-alias float[4] view.
    // Other compilers copy v into a union and index its float array.
    static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) {
#ifdef Vc_GCC
        if (__builtin_constant_p(i)) {
            return extract_float_imm(v, i);
//X         if (index <= 1) {
//X             unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v));
//X             if (index == 0) tmp &= 0xFFFFFFFFull;
//X             if (index == 1) tmp >>= 32;
//X             return Common::AliasingEntryHelper<EntryType>(tmp);
//X         }
        } else {
            typedef float float4[4] Vc_MAY_ALIAS;
            const float4 &data = reinterpret_cast<const float4 &>(v);
            return data[i];
        }
#else
        union { __m128 v; float m[4]; } u;
        u.v = v;
        return u.m[i];
#endif
    }

    // Loads four floats from mem. With Vc_IMPL_SSE4_1 the non-temporal
    // _mm_stream_load_si128 is used (the const is cast away only because that intrinsic
    // takes a non-const pointer); otherwise an ordinary aligned load. Both intrinsics
    // require mem to be 16-byte aligned.
    static Vc_INTRINSIC Vc_PURE __m128  _mm_stream_load(const float *mem) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
        return _mm_load_ps(mem);
#endif
    }
    // Loads two doubles from mem. With Vc_IMPL_SSE4_1 the non-temporal
    // _mm_stream_load_si128 is used (the const is cast away only because that intrinsic
    // takes a non-const pointer); otherwise an ordinary aligned load. Both intrinsics
    // require mem to be 16-byte aligned.
    static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
        return _mm_load_pd(mem);
#endif
    }
    // Loads 16 bytes of integer data from mem. With Vc_IMPL_SSE4_1 the non-temporal
    // _mm_stream_load_si128 is used (the const is cast away only because that intrinsic
    // takes a non-const pointer); otherwise an ordinary aligned load. Both intrinsics
    // require mem to be 16-byte aligned.
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
        return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
    }
    // The overloads for the remaining integer element types all load the same 16 bytes;
    // they reinterpret the pointer as const int * and forward to the int overload.
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }

#ifndef __x86_64__
    // Stand-in for _mm_cvtsi64_si128, compiled only when __x86_64__ is not defined: the 64
    // bits of x are loaded into the low half of the register through a scalar double load,
    // which zeroes the upper half.
    Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
        const double *bits_of_x = reinterpret_cast<const double *>(&x);
        const __m128d low_half = _mm_load_sd(bits_of_x);
        return _mm_castpd_si128(low_half);
    }
#endif

}  // namespace SseIntrinsics
}  // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
using namespace SseIntrinsics;

// Maps a type T to the types used for passing it by value, by mutable reference and by
// const reference.
template <typename T> struct ParameterHelper
{
    using ByValue = T;
    using Reference = T &;
    using ConstRef = const T &;
};

// Primary template, intentionally empty. Specializations are not part of this section
// of the file.
template <typename T> struct VectorHelper
{
};

// Selects the SSE register type that holds elements of type T: __m128d for double,
// __m128 for float and __m128i for every other type.
template <typename T> struct VectorTypeHelper
{
    using Type = __m128i;
};
template <> struct VectorTypeHelper<double>
{
    using Type = __m128d;
};
template <> struct VectorTypeHelper<float>
{
    using Type = __m128;
};

// Identity mapping in its primary form: the gather mask type is the given type itself.
template <typename T> struct DetermineGatherMask
{
    using Type = T;
};

// Bundles the types and constants that describe an SSE vector with elements of type T.
template <typename T> struct VectorTraits
{
    // register type holding the elements
    using VectorType = typename VectorTypeHelper<T>::Type;
    // element type as provided by Common::ensure_alignment_equals_sizeof
    using EntryType = typename Common::ensure_alignment_equals_sizeof<T>::type;
    // number of elements per register
    static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
    // division is only flagged as available for non-integral element types
    enum Constants { HasVectorDivision = !std::is_integral<T>::value };
    using MaskType = Mask<T>;
    using GatherMaskType = typename DetermineGatherMask<MaskType>::Type;
    using StorageType = Common::VectorMemoryUnion<VectorType, EntryType>;
};

// Declaration only; no definition is visible in this section of the file.
template <typename T> struct VectorHelperSize;
}  // namespace SSE
}  // namespace Vc

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
#pragma GCC diagnostic pop
#endif

/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_SHUFFLE_H_
#define VC_SSE_SHUFFLE_H_


namespace Vc_VERSIONED_NAMESPACE
{
    // Element selectors for the shuffle, blend and permute templates below: X0..X7
    // (values 0..7) name elements of the first operand, Y0..Y7 (values 8..15) elements of
    // the second operand. The arithmetic in those templates relies on these values.
    // Const0 (value 16) is not used in this section - presumably it selects a constant
    // zero elsewhere; TODO confirm.
    enum VecPos {
        X0, X1, X2, X3, X4, X5, X6, X7,
        Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
        Const0
    };

namespace Mem
{
        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        // The two low result elements are selected from x (Dst0, Dst1 in X0..X3), the two
        // high result elements from y (Dst2, Dst3 in Y0..Y3).
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            static_assert(X0 <= Dst0 && Dst0 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst1 && Dst1 <= X3, "Incorrect_Range");
            static_assert(Y0 <= Dst2 && Dst2 <= Y3, "Incorrect_Range");
            static_assert(Y0 <= Dst3 && Dst3 <= Y3, "Incorrect_Range");
            // two selector bits per result element, lowest element first
            constexpr int selectors =
                Dst0 | (Dst1 << 2) | ((Dst2 - Y0) << 4) | ((Dst3 - Y0) << 6);
            return _mm_shuffle_ps(x, y, selectors);
        }

        // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0]
        // The low result element is selected from x (Dst0 in X0..X1), the high result
        // element from y (Dst1 in Y0..Y1).
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            static_assert(X0 <= Dst0 && Dst0 <= X1, "Incorrect_Range");
            static_assert(Y0 <= Dst1 && Dst1 <= Y1, "Incorrect_Range");
            // bit 0 selects the element of x, bit 1 the element of y
            constexpr int selectors = Dst0 | ((Dst1 - Y0) << 1);
            return _mm_shuffle_pd(x, y, selectors);
        }

        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        // Integer variant: reinterprets both operands as float vectors, reuses the __m128
        // shuffle (including its range checks) and casts the result back.
        template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
        Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
        {
            const __m128 x_as_ps = _mm_castsi128_ps(x);
            const __m128 y_as_ps = _mm_castsi128_ps(y);
            const __m128 shuffled = shuffle<Dst0, Dst1, Dst2, Dst3>(x_as_ps, y_as_ps);
            return _mm_castps_si128(shuffled);
        }

        // blend<X0, Y1>([x0 x1], [y0, y1]) = [x0 y1]
        // Every selector must name its own position, either in x or in y; the selectors
        // are translated into the bit mask expected by blend_pd (bit i set: take y).
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            constexpr int take_y = (Dst0 == Y0 ? 1 : 0) | (Dst1 == Y1 ? 2 : 0);
            return Vc::SseIntrinsics::blend_pd<take_y>(x, y);
        }

        // blend<X0, Y1, Y2, X3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 y2 x3]
        // Every selector must name its own position, either in x or in y; the selectors
        // are translated into the bit mask expected by blend_ps (bit i set: take y).
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            constexpr int take_y = (Dst0 == Y0 ? 1 : 0) | (Dst1 == Y1 ? 2 : 0) |
                                   (Dst2 == Y2 ? 4 : 0) | (Dst3 == Y3 ? 8 : 0);
            return Vc::SseIntrinsics::blend_ps<take_y>(x, y);
        }

        // Blend of eight 16-bit elements. Every selector must name its own position,
        // either in x or in y; the selectors are translated into the bit mask expected by
        // blend_epi16 (bit i set: take y).
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
            static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
            static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
            static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
            constexpr int take_y =
                (Dst0 == Y0 ? 0x01 : 0) | (Dst1 == Y1 ? 0x02 : 0) |
                (Dst2 == Y2 ? 0x04 : 0) | (Dst3 == Y3 ? 0x08 : 0) |
                (Dst4 == Y4 ? 0x10 : 0) | (Dst5 == Y5 ? 0x20 : 0) |
                (Dst6 == Y6 ? 0x40 : 0) | (Dst7 == Y7 ? 0x80 : 0);
            return Vc::SseIntrinsics::blend_epi16<take_y>(x, y);
        }

        // permute<X1, X2, X0, X2>([x0 x1 x2 x3]) = [x1 x2 x0 x2]
        // Rearranges the four floats of a single vector; every selector must be in X0..X3.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            static_assert(X0 <= Dst0 && Dst0 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst1 && Dst1 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst2 && Dst2 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst3 && Dst3 <= X3, "Incorrect_Range");
            // two selector bits per result element, lowest element first
            constexpr int selectors = Dst0 | (Dst1 << 2) | (Dst2 << 4) | (Dst3 << 6);
            return _mm_shuffle_ps(x, x, selectors);
        }

        // permute<X1, X0>([x0 x1]) = [x1 x0]
        //
        // Rearranges the two doubles of a single vector; both selectors must be X0 or X1.
        //
        // _mm_shuffle_pd takes a 2-bit immediate: bit 0 selects the low result element
        // (from its first operand) and bit 1 the high result element (from its second
        // operand). The selector of the high element therefore has to be scaled by 2, as
        // in the two-operand shuffle above. The previous factor of 4 moved it to bit 2,
        // where it is either ignored (so the high element was always x0) or rejected by
        // the compiler as an out-of-range immediate.
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
            return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 2);
        }

        // Rearranges the four 32-bit elements of a single vector; every selector must be
        // in X0..X3.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(X0 <= Dst0 && Dst0 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst1 && Dst1 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst2 && Dst2 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst3 && Dst3 <= X3, "Incorrect_Range");
            // two selector bits per result element, lowest element first
            constexpr int selectors = Dst0 | (Dst1 << 2) | (Dst2 << 4) | (Dst3 << 6);
            return _mm_shuffle_epi32(x, selectors);
        }

        // Rearranges the four low 16-bit elements (selectors in X0..X3); the four high
        // elements are passed through unchanged by _mm_shufflelo_epi16.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
            static_assert(X0 <= Dst0 && Dst0 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst1 && Dst1 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst2 && Dst2 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst3 && Dst3 <= X3, "Incorrect_Range");
            // two selector bits per result element, lowest element first
            constexpr int selectors = Dst0 | (Dst1 << 2) | (Dst2 << 4) | (Dst3 << 6);
            return _mm_shufflelo_epi16(x, selectors);
        }

        // Rearranges the four high 16-bit elements (selectors in X4..X7); the four low
        // elements are passed through unchanged by _mm_shufflehi_epi16.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
            static_assert(X4 <= Dst0 && Dst0 <= X7, "Incorrect_Range");
            static_assert(X4 <= Dst1 && Dst1 <= X7, "Incorrect_Range");
            static_assert(X4 <= Dst2 && Dst2 <= X7, "Incorrect_Range");
            static_assert(X4 <= Dst3 && Dst3 <= X7, "Incorrect_Range");
            // selectors are relative to X4; two bits per result element, lowest first
            constexpr int selectors = (Dst0 - X4) | ((Dst1 - X4) << 2) |
                                      ((Dst2 - X4) << 4) | ((Dst3 - X4) << 6);
            return _mm_shufflehi_epi16(x, selectors);
        }

        // Rearranges the eight 16-bit elements of x separately within the low half
        // (Dst0..Dst3 in X0..X3) and within the high half (Dst4..Dst7 in X4..X7). A half
        // whose selectors describe the identity is left untouched.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
            static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
            static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
            constexpr bool low_is_identity =
                Dst0 == X0 && Dst1 == X1 && Dst2 == X2 && Dst3 == X3;
            constexpr bool high_is_identity =
                Dst4 == X4 && Dst5 == X5 && Dst6 == X6 && Dst7 == X7;
            if (!low_is_identity) {
                constexpr int low_selectors = Dst0 | (Dst1 << 2) | (Dst2 << 4) | (Dst3 << 6);
                x = _mm_shufflelo_epi16(x, low_selectors);
            }
            if (!high_is_identity) {
                constexpr int high_selectors = (Dst4 - X4) | ((Dst5 - X4) << 2) |
                                               ((Dst6 - X4) << 4) | ((Dst7 - X4) << 6);
                x = _mm_shufflehi_epi16(x, high_selectors);
            }
            return x;
        }
}  // namespace Mem

    // The shuffles and permutes above use memory ordering. The ones below use register ordering:
namespace Reg
{
        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        // Register-order front end: takes the selectors highest element first and forwards
        // them to Mem::shuffle in reversed (memory) order.
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
        }

        // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
        // Register-order front end: takes the selectors highest element first and forwards
        // them to Mem::shuffle in reversed (memory) order.
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            return Mem::shuffle<Dst0, Dst1>(x, y);
        }

        // permute<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1]
        // Rearranges the four 32-bit elements of a single vector; the selectors are given
        // highest element first and must all be in X0..X3.
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(X0 <= Dst0 && Dst0 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst1 && Dst1 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst2 && Dst2 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst3 && Dst3 <= X3, "Incorrect_Range");
            // two selector bits per result element, lowest element first
            constexpr int selectors = Dst0 | (Dst1 << 2) | (Dst2 << 4) | (Dst3 << 6);
            return _mm_shuffle_epi32(x, selectors);
        }

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        // Integer variant in register order: the two low result elements are selected from
        // x (Dst0, Dst1 in X0..X3), the two high ones from y (Dst2, Dst3 in Y0..Y3). The
        // shuffle itself is done on the float reinterpretation of the operands.
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
            static_assert(X0 <= Dst0 && Dst0 <= X3, "Incorrect_Range");
            static_assert(X0 <= Dst1 && Dst1 <= X3, "Incorrect_Range");
            static_assert(Y0 <= Dst2 && Dst2 <= Y3, "Incorrect_Range");
            static_assert(Y0 <= Dst3 && Dst3 <= Y3, "Incorrect_Range");
            // two selector bits per result element, lowest element first
            constexpr int selectors =
                Dst0 | (Dst1 << 2) | ((Dst2 - Y0) << 4) | ((Dst3 - Y0) << 6);
            const __m128 x_as_ps = _mm_castsi128_ps(x);
            const __m128 y_as_ps = _mm_castsi128_ps(y);
            return _mm_castps_si128(_mm_shuffle_ps(x_as_ps, y_as_ps, selectors));
        }

        // blend<Y1, X0>([x1 x0], [y1 y0]) = [y1 x0]
        // Register-order front end: takes the selectors highest element first and forwards
        // them to Mem::blend in reversed (memory) order.
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            return Mem::blend<Dst0, Dst1>(x, y);
        }

        // Register-order front end: takes the selectors highest element first and forwards
        // them to Mem::blend in reversed (memory) order.
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
}  // namespace Reg
}  // namespace Vc

#endif // VC_SSE_SHUFFLE_H_

#endif // VC_SSE_INTRINSICS_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
// Short names for the integer element types; used as ConvertTag arguments by the
// convert overloads below.
using uint = unsigned int;
using ushort = unsigned short;
using uchar = unsigned char;
using schar = signed char;

// sse_cast {{{1
// Generic case of sse_cast: returns v converted implicitly to To (an identity when To
// and From are the same type). The casts between different SSE register types are
// provided by the explicit specializations.
template <typename To, typename From> Vc_ALWAYS_INLINE Vc_CONST To sse_cast(From v)
{
    return v;
}
// Specializations for every pair of distinct SSE register types. Each one maps to the
// matching _mm_cast* intrinsic, which reinterprets the bits without converting values.
template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128 >(__m128  v) { return _mm_castps_si128(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128d>(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128  sse_cast<__m128 , __m128d>(__m128d v) { return _mm_castpd_ps(v);    }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128  sse_cast<__m128 , __m128i>(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128i>(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128 >(__m128  v) { return _mm_castps_pd(v);    }

// convert {{{1
// Empty tag type that encodes a (From, To) element-type pair; it is passed as the second
// argument of convert to select the matching overload.
template <typename From, typename To> struct ConvertTag
{
};
// Generic entry point: converts a vector with From elements into a vector with To
// elements by dispatching on ConvertTag<From, To> to one of the convert overloads.
template <typename From, typename To>
Vc_INTRINSIC typename VectorTraits<To>::VectorType convert(
    typename VectorTraits<From>::VectorType v)
{
    using Tag = ConvertTag<From, To>;
    return convert(v, Tag{});
}

// Conversions to int.
// float/double: truncating conversion (the cvtt* intrinsics round toward zero).
Vc_INTRINSIC __m128i convert(__m128  v, ConvertTag<float , int   >) { return _mm_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, int   >) { return _mm_cvttpd_epi32(v); }
// int/uint: the bits are passed through unchanged.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int   , int   >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint  , int   >) { return v; }
// short/ushort: the low four 16-bit elements are duplicated into both halves of a 32-bit
// element; shifting right by 16 (arithmetic for short, logical for ushort) then sign- or
// zero-extends them.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int   >) { return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int   >) { return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16); }
// float -> uint. The only available truncating conversion is the signed one, so
// elements of 2^31 and above are converted as (v - 2^31) and get bit 31 set afterwards;
// smaller elements use the signed conversion directly.
Vc_INTRINSIC __m128i convert(__m128  v, ConvertTag<float , uint  >) {
    const __m128 two_pow_31 = _mm_set1_ps(1u << 31);
    const __m128 is_large = _mm_cmpge_ps(v, two_pow_31);
    const __m128i small_result = _mm_cvttps_epi32(v);
    const __m128i large_result = _mm_xor_si128(
        _mm_cvttps_epi32(_mm_sub_ps(v, two_pow_31)), _mm_set1_epi32(1 << 31));
    // take large_result where v >= 2^31, small_result elsewhere
    return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(small_result),
                                      _mm_castsi128_ps(large_result), is_large));
}
// double -> uint for the two elements of v (result in the low 64 bits). Values are
// moved into the range of the signed conversion by subtracting 2^31; XORing bit 31 of
// both results (the 64-bit constant 0x8000000080000000) adds the 2^31 back.
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, uint  >) {
#ifdef Vc_IMPL_SSE4_1
    // floor first, then convert (v - 2^31) for every element
    return _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(_mm_floor_pd(v), _mm_set1_pd(0x80000000u))),
                         _mm_cvtsi64_si128(0x8000000080000000ull));
#else
    // elements below 2^31 use the signed conversion directly, the others the offset form
    return blendv_epi8(_mm_cvttpd_epi32(v),
                       _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(v, _mm_set1_pd(0x80000000u))),
                                     _mm_cvtsi64_si128(0x8000000080000000ull)),
                       _mm_castpd_si128(_mm_cmpge_pd(v, _mm_set1_pd(0x80000000u))));
#endif
}
// Integer conversions to uint. int/uint pass the bits through unchanged; short/ushort
// widen the low four 16-bit elements with sign or zero extension (the elements are
// duplicated into both halves of a 32-bit element and shifted right by 16).
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int   , uint  >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint  , uint  >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint  >) { return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint  >) { return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16); }
// Conversions to float that map directly to one intrinsic (or to the identity).
Vc_INTRINSIC __m128  convert(__m128  v, ConvertTag<float , float >) { return v; }
Vc_INTRINSIC __m128  convert(__m128d v, ConvertTag<double, float >) { return _mm_cvtpd_ps(v); }
Vc_INTRINSIC __m128  convert(__m128i v, ConvertTag<int   , float >) { return _mm_cvtepi32_ps(v); }
// uint -> float. Elements without bit 31 set are converted with the signed conversion.
// Elements with bit 31 set (negative when read as int) are split into bits 9..30
// (mask 0x7ffffe00) and bits 0..8 (mask 0x000001ff); both parts are converted
// separately and summed together with 2^31.
Vc_INTRINSIC __m128  convert(__m128i v, ConvertTag<uint  , float >) {
    // see AVX::convert<uint, float> for an explanation of the math behind the
    // implementation
    using namespace SSE;
    return blendv_ps(_mm_cvtepi32_ps(v),
        _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(v, _mm_set1_epi32(0x7ffffe00))),
                      _mm_add_ps(_mm_set1_ps(1u << 31), _mm_cvtepi32_ps(_mm_and_si128(
                                                          v, _mm_set1_epi32(0x000001ff))))),
        _mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())));
}
// short/ushort -> float: widen to int first, then use the int -> float conversion.
Vc_INTRINSIC __m128  convert(__m128i v, ConvertTag<short , float >) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, float>()); }
Vc_INTRINSIC __m128  convert(__m128i v, ConvertTag<ushort, float >) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, float>()); }
// Conversions to double.
Vc_INTRINSIC __m128d convert(__m128  v, ConvertTag<float , double>) { return _mm_cvtps_pd(v); }
Vc_INTRINSIC __m128d convert(__m128d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<int   , double>) { return _mm_cvtepi32_pd(v); }
// uint: XOR with setmin_epi32() (defined elsewhere; presumably the value with only bit 31
// set), convert the result as signed and add 2^31 back.
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<uint  , double>) { return _mm_add_pd(_mm_cvtepi32_pd(_mm_xor_si128(v, setmin_epi32())), _mm_set1_pd(1u << 31)); }
// short/ushort: widen to int first, then use the int -> double conversion.
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
// Conversions to short. The 32-bit sources are narrowed with _mm_packs_epi32, which
// saturates to the signed 16-bit range; the upper four result elements are zero.
Vc_INTRINSIC __m128i convert(__m128  v, ConvertTag<float , short >) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int   , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint  , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
// short/ushort: the bits are passed through unchanged.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , short >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, short >) { return v; }
// double: convert to int first, then narrow like int.
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, short >) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, short>()); }
// int -> ushort by truncation: collects the low 16 bits of the four 32-bit elements in
// the low half of the result; the high half is zero.
// Notation below: Li/Hi = low/high 16-bit word of 32-bit element i, 0 = zero word;
// words are listed from the lowest to the highest position.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int   , ushort>) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i from_low_half = _mm_unpacklo_epi16(v, zero);   // L0 0  H0 0  L1 0  H1 0
    const __m128i from_high_half = _mm_unpackhi_epi16(v, zero);  // L2 0  H2 0  L3 0  H3 0
    const __m128i even = _mm_unpacklo_epi16(from_low_half, from_high_half);  // L0 L2 0 0 H0 H2 0 0
    const __m128i odd = _mm_unpackhi_epi16(from_low_half, from_high_half);   // L1 L3 0 0 H1 H3 0 0
    return _mm_unpacklo_epi16(even, odd);                        // L0 L1 L2 L3 0  0  0  0
}
// uint -> ushort by truncation. Keeping the low 16 bits of every 32-bit element does not
// depend on signedness, so this reuses the int -> ushort overload, which performs exactly
// the same unpack sequence.
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint  , ushort>) {
    return convert(v, ConvertTag<int, ushort>());
}
// Remaining conversions to ushort. float and double are truncated to int first and then
// narrowed like int; short/ushort pass the bits through unchanged.
Vc_INTRINSIC __m128i convert(__m128  v, ConvertTag<float , ushort>) { return convert(_mm_cvttps_epi32(v), ConvertTag<int, ushort>()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, ushort>) { return v; }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, ushort>()); }

// }}}1
}  // namespace SSE
}  // namespace Vc

#endif // VC_SSE_CASTS_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_SHUFFLE_H_
#define VC_AVX_SHUFFLE_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Empty tag type carrying a compile-time list of integer indexes.
// NOTE(review): no user is visible in this part of the file; presumably it selects a
// permutation overload elsewhere - confirm against callers.
template <int... Dst> struct Permutation {};
// Empty tag type carrying a compile-time list of per-element selectors. The blend()
// overload below takes it as its last parameter to pick elements from a (0) or b (1).
template <uint8_t... Sel> struct Mask {};

#ifdef Vc_IMPL_AVX2
/**
 * Blends the sixteen 16-bit elements of \p a and \p b.
 *
 * Selector SelN picks element N of the result: 0 takes it from \p a, 1 takes it from
 * \p b. Any other selector value is rejected at compile time.
 *
 * _mm256_blend_epi16 applies a single 8-bit immediate to both 128-bit lanes, so it can
 * only express masks where Sel8..Sel15 repeat Sel0..Sel7. For all other masks a
 * per-element mask register and _mm256_blendv_epi8 are used, so that the upper eight
 * selectors are honoured instead of being silently dropped.
 */
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
          uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
          uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
          uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
                                 Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
    static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
                      (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
                      (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
                      (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
                      (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
                      (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
                      (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
                      (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
                  "Selectors must be 0 or 1 to select the value from a or b");
    // 8-bit blend masks for the lower (elements 0-7) and upper (elements 8-15) lane
    constexpr int lo_mask =
        (Sel0  << 0) | (Sel1  << 1) | (Sel2  << 2) | (Sel3  << 3) |
        (Sel4  << 4) | (Sel5  << 5) | (Sel6  << 6) | (Sel7  << 7);
    constexpr int hi_mask =
        (Sel8  << 0) | (Sel9  << 1) | (Sel10 << 2) | (Sel11 << 3) |
        (Sel12 << 4) | (Sel13 << 5) | (Sel14 << 6) | (Sel15 << 7);
    if (lo_mask == hi_mask) {
        // both lanes use the same pattern: the immediate form is sufficient
        return _mm256_blend_epi16(a, b, lo_mask);
    }
    // lanes differ: build an all-ones/all-zeros mask per 16-bit element and blend
    // bytewise (blendv_epi8 selects b where the mask byte has its high bit set)
    return _mm256_blendv_epi8(
        a, b,
        _mm256_setr_epi16(Sel0  ? -1 : 0, Sel1  ? -1 : 0, Sel2  ? -1 : 0, Sel3  ? -1 : 0,
                          Sel4  ? -1 : 0, Sel5  ? -1 : 0, Sel6  ? -1 : 0, Sel7  ? -1 : 0,
                          Sel8  ? -1 : 0, Sel9  ? -1 : 0, Sel10 ? -1 : 0, Sel11 ? -1 : 0,
                          Sel12 ? -1 : 0, Sel13 ? -1 : 0, Sel14 ? -1 : 0, Sel15 ? -1 : 0));
}
#endif  // Vc_IMPL_AVX2
}  // namespace Detail
namespace Mem
{
#ifdef Vc_IMPL_AVX2
        // Permutes the 16-bit elements selected by X0..X3 via _mm256_shufflelo_epi16.
        // DstN names the source position for result slot N; every selector must lie in
        // [X0, X3].
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X0 <= Dst2 && X0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            // two selector bits per destination slot
            constexpr int control = Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64;
            return _mm256_shufflelo_epi16(x, control);
        }

        // Permutes the 16-bit elements selected by X4..X7 via _mm256_shufflehi_epi16.
        // Every selector must lie in [X4, X7]; it is rebased to 0..3 for the immediate.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
            static_assert(X4 <= Dst0 && X4 <= Dst1 && X4 <= Dst2 && X4 <= Dst3, "Incorrect_Range");
            static_assert(X7 >= Dst0 && X7 >= Dst1 && X7 >= Dst2 && X7 >= Dst3, "Incorrect_Range");
            constexpr int control =
                (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64;
            return _mm256_shufflehi_epi16(x, control);
        }
#endif  // Vc_IMPL_AVX2

        // permute128<L, H>(x): rearranges the two 128-bit halves of x.
        // L picks the source of the low result half and H the source of the high result
        // half; each is X0, X1 or Const0 (Const0 sets the zeroing bit of the control
        // byte: 0x8 for the low half, 0x80 for the high half).
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
            static_assert(L == Const0 || (X0 <= L && L <= X1), "Incorrect_Range");
            static_assert(H == Const0 || (X0 <= H && H <= X1), "Incorrect_Range");
            constexpr int lowCtl  = L == Const0 ? 0x8 : static_cast<int>(L);
            constexpr int highCtl = H == Const0 ? 0x80 : H * 16;
            constexpr int control = lowCtl + highCtl;
            return _mm256_permute2f128_ps(x, x, control);
        }
        // Double-precision variant of permute128.
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
            static_assert(L == Const0 || (X0 <= L && L <= X1), "Incorrect_Range");
            static_assert(H == Const0 || (X0 <= H && H <= X1), "Incorrect_Range");
            constexpr int lowCtl  = L == Const0 ? 0x8 : static_cast<int>(L);
            constexpr int highCtl = H == Const0 ? 0x80 : H * 16;
            constexpr int control = lowCtl + highCtl;
            return _mm256_permute2f128_pd(x, x, control);
        }
        // Integer variant of permute128; uses the AVX2 instruction when available and
        // the AVX (floating-point domain) one otherwise.
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
            static_assert(L == Const0 || (X0 <= L && L <= X1), "Incorrect_Range");
            static_assert(H == Const0 || (X0 <= H && H <= X1), "Incorrect_Range");
            constexpr int lowCtl  = L == Const0 ? 0x8 : static_cast<int>(L);
            constexpr int highCtl = H == Const0 ? 0x80 : H * 16;
            constexpr int control = lowCtl + highCtl;
#ifdef Vc_IMPL_AVX2
            return _mm256_permute2x128_si256(x, x, control);
#else
            return _mm256_permute2f128_si256(x, x, control);
#endif
        }
        // shuffle128<L, H>(x, y): builds a vector from 128-bit halves of x and y.
        // L selects the low result half, H the high result half. X0/X1 name the halves
        // of x (control values 0/1), Y0/Y1 the halves of y (control values 2/3).
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
            static_assert(X0 <= L && X0 <= H, "Incorrect_Range");
            static_assert(Y1 >= L && Y1 >= H, "Incorrect_Range");
            constexpr int lowSel  = L < Y0 ? static_cast<int>(L) : L - Y0 + 2;
            constexpr int highSel = H < Y0 ? static_cast<int>(H) : H - Y0 + 2;
            constexpr int control = lowSel + highSel * 16;
            return _mm256_permute2f128_ps(x, y, control);
        }
        // Integer variant of shuffle128; uses the AVX2 instruction when available.
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
            static_assert(X0 <= L && X0 <= H, "Incorrect_Range");
            static_assert(Y1 >= L && Y1 >= H, "Incorrect_Range");
            constexpr int lowSel  = L < Y0 ? static_cast<int>(L) : L - Y0 + 2;
            constexpr int highSel = H < Y0 ? static_cast<int>(H) : H - Y0 + 2;
            constexpr int control = lowSel + highSel * 16;
#ifdef Vc_IMPL_AVX2
            return _mm256_permute2x128_si256(x, y, control);
#else
            return _mm256_permute2f128_si256(x, y, control);
#endif
        }
        // Double-precision variant of shuffle128.
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
            static_assert(X0 <= L && X0 <= H, "Incorrect_Range");
            static_assert(Y1 >= L && Y1 >= H, "Incorrect_Range");
            constexpr int lowSel  = L < Y0 ? static_cast<int>(L) : L - Y0 + 2;
            constexpr int highSel = H < Y0 ? static_cast<int>(H) : H - Y0 + 2;
            constexpr int control = lowSel + highSel * 16;
            return _mm256_permute2f128_pd(x, y, control);
        }
        // In-lane permute of four doubles via _mm256_permute_pd. Dst0/Dst1 must be X0 or
        // X1 (lower lane), Dst2/Dst3 must be X2 or X3 (upper lane); one control bit per
        // destination slot.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X2 <= Dst2 && X2 <= Dst3, "Incorrect_Range");
            static_assert(X1 >= Dst0 && X1 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8;
            return _mm256_permute_pd(x, control);
        }
        // In-lane permute of floats via _mm256_permute_ps: one 8-bit control (two bits
        // per slot, selectors in [X0, X3]) is passed to the intrinsic.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X0 <= Dst2 && X0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64;
            return _mm256_permute_ps(x, control);
        }
        // Integer variant: reinterprets the register as floats, applies the float
        // permute above, and reinterprets the result back.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
            const __m256 asFloat = _mm256_castsi256_ps(x);
            const __m256 permuted = permute<Dst0, Dst1, Dst2, Dst3>(asFloat);
            return _mm256_castps_si256(permuted);
        }
#ifdef Vc_IMPL_AVX2
        // Permutes the four 64-bit elements of x via _mm256_permute4x64_epi64. DstN in
        // [X0, X3] names the source element for result slot N.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X0 <= Dst2 && X0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64;
            return _mm256_permute4x64_epi64(x, control);
        }
#endif  // Vc_IMPL_AVX2
        // Shuffle of doubles via _mm256_shuffle_pd. Result slots alternate between x
        // and y: Dst0 in [X0, X1], Dst1 in [Y0, Y1], Dst2 in [X2, X3], Dst3 in [Y2, Y3].
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
            static_assert(X0 <= Dst0 && Y0 <= Dst1 && X2 <= Dst2 && Y2 <= Dst3, "Incorrect_Range");
            static_assert(X1 >= Dst0 && Y1 >= Dst1 && X3 >= Dst2 && Y3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8;
            return _mm256_shuffle_pd(x, y, control);
        }
        // Shuffle of floats via _mm256_shuffle_ps. The first two result slots of each
        // lane come from x (Dst0/Dst1 in [X0, X3]), the last two from y (Dst2/Dst3 in
        // [Y0, Y3]).
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && Y0 <= Dst2 && Y0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && Y3 >= Dst2 && Y3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64;
            return _mm256_shuffle_ps(x, y, control);
        }
        // Element-wise blend of eight floats via _mm256_blend_ps. Slot N of the result
        // is taken from x when DstN == XN and from y when DstN == YN; no other value is
        // accepted.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
            static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
            static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
            static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
            // DstN / YN is 1 exactly when slot N takes its value from y
            constexpr int fromY =
                (Dst0 / Y0) *   1 + (Dst1 / Y1) *   2 +
                (Dst2 / Y2) *   4 + (Dst3 / Y3) *   8 +
                (Dst4 / Y4) *  16 + (Dst5 / Y5) *  32 +
                (Dst6 / Y6) *  64 + (Dst7 / Y7) * 128;
            return _mm256_blend_ps(x, y, fromY);
        }
        // Integer variant: reinterprets both inputs as floats, applies the float blend
        // above, and reinterprets the result back.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
            const __m256 xf = _mm256_castsi256_ps(x);
            const __m256 yf = _mm256_castsi256_ps(y);
            const __m256 mixed = blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(xf, yf);
            return _mm256_castps_si256(mixed);
        }
        // Maps a position in the upper half (X4 and above) to the corresponding Y
        // position (X4 -> Y0, ...); positions below X4 are passed through unchanged.
        template<VecPos Dst> struct ScaleForBlend {
            enum {
                Value = Dst < X4 ? Dst : Dst - X4 + Y0
            };
        };
        // Permutes the eight floats of x across both 128-bit lanes. DstN in [X0, X7]
        // names the source element for result slot N.
        //
        // If both lanes use the same in-lane pattern, the 4-selector permute is used.
        // Otherwise the lower (slots 0-3) and upper (slots 4-7) result halves are built
        // separately from the two input halves, using the first pattern that matches:
        // single-source permute, two-source shuffle, unpack, or blend.
        // NOTE(review): a selector combination that matches none of the patterns leaves
        // `lo` / `hi` unassigned; such combinations are not supported by this function.
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
            static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
            static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
            static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
            static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
            static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
            static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
            static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
            static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
            if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
                return permute<Dst0, Dst1, Dst2, Dst3>(x);
            }
            const __m128 loIn = _mm256_castps256_ps128(x);
            const __m128 hiIn = _mm256_extractf128_ps(x, 1);
            __m128 lo, hi;

            // Immediates for the single-source cases. `% X4` yields the index within the
            // source half both when all selectors are below X4 (index == Dst) and when
            // all are at or above X4 (index == Dst - X4), and keeps the value a valid
            // 8-bit immediate even in the branch that is not taken.
            constexpr int loImm =
                (Dst0 % X4) + (Dst1 % X4) * 4 + (Dst2 % X4) * 16 + (Dst3 % X4) * 64;
            constexpr int hiImm =
                (Dst4 % X4) + (Dst5 % X4) * 4 + (Dst6 % X4) * 16 + (Dst7 % X4) * 64;

            if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
                lo = _mm_permute_ps(loIn, loImm);
            } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
                // all four come from the upper input half: indexes are relative to X4
                lo = _mm_permute_ps(hiIn, loImm);
            } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
                lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
            } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
                lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
            } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
                lo = _mm_unpacklo_ps(loIn, hiIn);
            } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
                lo = _mm_unpacklo_ps(hiIn, loIn);
            } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
                lo = _mm_unpackhi_ps(loIn, hiIn);
            } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
                lo = _mm_unpackhi_ps(hiIn, loIn);
            } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
                lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
                   ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
            }

            if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
                hi = _mm_permute_ps(hiIn, hiImm);
            } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
                // all four come from the lower input half: indexes are used as they are
                hi = _mm_permute_ps(loIn, hiImm);
            } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
                hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
            } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
                hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
            } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
                hi = _mm_unpacklo_ps(loIn, hiIn);
            } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
                hi = _mm_unpacklo_ps(hiIn, loIn);
            } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
                hi = _mm_unpackhi_ps(loIn, hiIn);
            } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
                hi = _mm_unpackhi_ps(hiIn, loIn);
            } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
                hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
                   ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
            }

            return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
        }
}  // namespace Mem
}  // namespace Vc

    // In little-endian notation the low bits are written on the right and the high bits
    // on the left. With vectors this becomes very confusing:
    // Mem: abcd
    // Reg: dcba
    //
    // The shuffles and permutes above use memory ordering. The ones below use register ordering:
namespace Vc_VERSIONED_NAMESPACE
{
namespace Reg
{
        // permute128<H, L>(x, y): builds a vector from 128-bit halves of x and y, with
        // the template arguments given high half first. X0/X1 name the halves of x
        // (control values 0/1), Y0/Y1 the halves of y (control values 2/3).
        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
            static_assert(X0 <= L && X0 <= H, "Incorrect_Range");
            static_assert(Y1 >= L && Y1 >= H, "Incorrect_Range");
            constexpr int lowSel  = L < Y0 ? static_cast<int>(L) : L - Y0 + 2;
            constexpr int highSel = H < Y0 ? static_cast<int>(H) : H - Y0 + 2;
            constexpr int control = lowSel + highSel * 16;
            return _mm256_permute2f128_ps(x, y, control);
        }
        // Integer variant; uses the AVX2 instruction when available.
        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
            static_assert(X0 <= L && X0 <= H, "Incorrect_Range");
            static_assert(Y1 >= L && Y1 >= H, "Incorrect_Range");
            constexpr int lowSel  = L < Y0 ? static_cast<int>(L) : L - Y0 + 2;
            constexpr int highSel = H < Y0 ? static_cast<int>(H) : H - Y0 + 2;
            constexpr int control = lowSel + highSel * 16;
#ifdef Vc_IMPL_AVX2
            return _mm256_permute2x128_si256(x, y, control);
#else
            return _mm256_permute2f128_si256(x, y, control);
#endif
        }
        // Double-precision variant.
        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
            static_assert(X0 <= L && X0 <= H, "Incorrect_Range");
            static_assert(Y1 >= L && Y1 >= H, "Incorrect_Range");
            constexpr int lowSel  = L < Y0 ? static_cast<int>(L) : L - Y0 + 2;
            constexpr int highSel = H < Y0 ? static_cast<int>(H) : H - Y0 + 2;
            constexpr int control = lowSel + highSel * 16;
            return _mm256_permute2f128_pd(x, y, control);
        }
        // Register-order permutes: template arguments are listed from the highest
        // destination slot down to slot 0.

        // Four doubles: Dst0/Dst1 in [X0, X1], Dst2/Dst3 in [X2, X3].
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X2 <= Dst2 && X2 <= Dst3, "Incorrect_Range");
            static_assert(X1 >= Dst0 && X1 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8;
            return _mm256_permute_pd(x, control);
        }
        // 256-bit float vector: selectors in [X0, X3], two control bits per slot.
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X0 <= Dst2 && X0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64;
            return _mm256_permute_ps(x, control);
        }
        // Two doubles in a 128-bit register: selectors in [X0, X1].
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1, "Incorrect_Range");
            static_assert(X1 >= Dst0 && X1 >= Dst1, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 2;
            return _mm_permute_pd(x, control);
        }
        // Four floats in a 128-bit register: selectors in [X0, X3].
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && X0 <= Dst2 && X0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && X3 >= Dst2 && X3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64;
            return _mm_permute_ps(x, control);
        }
        // Register-order shuffle of doubles (template arguments from highest slot down
        // to slot 0): Dst0 in [X0, X1], Dst1 in [Y0, Y1], Dst2 in [X2, X3], Dst3 in
        // [Y2, Y3].
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
            static_assert(X0 <= Dst0 && Y0 <= Dst1 && X2 <= Dst2 && Y2 <= Dst3, "Incorrect_Range");
            static_assert(X1 >= Dst0 && Y1 >= Dst1 && X3 >= Dst2 && Y3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8;
            return _mm256_shuffle_pd(x, y, control);
        }
        // Register-order shuffle of floats: Dst0/Dst1 in [X0, X3] select from x,
        // Dst2/Dst3 in [Y0, Y3] select from y.
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
            static_assert(X0 <= Dst0 && X0 <= Dst1 && Y0 <= Dst2 && Y0 <= Dst3, "Incorrect_Range");
            static_assert(X3 >= Dst0 && X3 >= Dst1 && Y3 >= Dst2 && Y3 >= Dst3, "Incorrect_Range");
            constexpr int control = Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64;
            return _mm256_shuffle_ps(x, y, control);
        }
}  // namespace Reg
}  // namespace Vc

#endif // VC_AVX_SHUFFLE_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
namespace Casts
{
    // avx_cast<T>(v): converts between the 128-bit and 256-bit register types.
    // One primary template is declared per source type; the destination type T is given
    // explicitly by the caller. Only the explicit specializations that follow provide
    // definitions.
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m128  v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m256  v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;

    // 128 -> 128
    // Casts between the three 128-bit register types. Same-type casts return the
    // argument; the others use the matching _mm_cast* intrinsic.
    template<> Vc_INTRINSIC __m128  avx_cast(__m128  v)
    {
        return v;
    }
    template<> Vc_INTRINSIC __m128  avx_cast(__m128i v)
    {
        return _mm_castsi128_ps(v);
    }
    template<> Vc_INTRINSIC __m128  avx_cast(__m128d v)
    {
        return _mm_castpd_ps(v);
    }
    template<> Vc_INTRINSIC __m128i avx_cast(__m128  v)
    {
        return _mm_castps_si128(v);
    }
    template<> Vc_INTRINSIC __m128i avx_cast(__m128i v)
    {
        return v;
    }
    template<> Vc_INTRINSIC __m128i avx_cast(__m128d v)
    {
        return _mm_castpd_si128(v);
    }
    template<> Vc_INTRINSIC __m128d avx_cast(__m128  v)
    {
        return _mm_castps_pd(v);
    }
    template<> Vc_INTRINSIC __m128d avx_cast(__m128i v)
    {
        return _mm_castsi128_pd(v);
    }
    template<> Vc_INTRINSIC __m128d avx_cast(__m128d v)
    {
        return v;
    }

    // 128 -> 256
    // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never
    // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's
    // upper 128bits are zero. Thus using the same register as AVX register will have the upper
    // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory
    // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do
    // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck,
    // do we really want to rely on specific compiler behavior here?
    // Widening casts: the input becomes the lower 128 bits of the result. Where the
    // element type changes as well, the 128-bit value is cast to the target element
    // type first and then widened with the _mm256_cast*128_*256 intrinsic.
    template<> Vc_INTRINSIC __m256  avx_cast(__m128  v)
    {
        return _mm256_castps128_ps256(v);
    }
    template<> Vc_INTRINSIC __m256  avx_cast(__m128i v)
    {
        const __m128 narrow = _mm_castsi128_ps(v);
        return _mm256_castps128_ps256(narrow);
    }
    template<> Vc_INTRINSIC __m256  avx_cast(__m128d v)
    {
        const __m128 narrow = _mm_castpd_ps(v);
        return _mm256_castps128_ps256(narrow);
    }
    template<> Vc_INTRINSIC __m256i avx_cast(__m128  v)
    {
        const __m128i narrow = _mm_castps_si128(v);
        return _mm256_castsi128_si256(narrow);
    }
    template<> Vc_INTRINSIC __m256i avx_cast(__m128i v)
    {
        return _mm256_castsi128_si256(v);
    }
    template<> Vc_INTRINSIC __m256i avx_cast(__m128d v)
    {
        const __m128i narrow = _mm_castpd_si128(v);
        return _mm256_castsi128_si256(narrow);
    }
    template<> Vc_INTRINSIC __m256d avx_cast(__m128  v)
    {
        const __m128d narrow = _mm_castps_pd(v);
        return _mm256_castpd128_pd256(narrow);
    }
    template<> Vc_INTRINSIC __m256d avx_cast(__m128i v)
    {
        const __m128d narrow = _mm_castsi128_pd(v);
        return _mm256_castpd128_pd256(narrow);
    }
    template<> Vc_INTRINSIC __m256d avx_cast(__m128d v)
    {
        return _mm256_castpd128_pd256(v);
    }

#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
    static Vc_INTRINSIC Vc_CONST __m256  zeroExtend(__m128  v) { return _mm256_permute2f128_ps   (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
    static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
    static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd   (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
#else
    static Vc_INTRINSIC Vc_CONST __m256  zeroExtend(__m128  v) { return _mm256_castps128_ps256(v); }
    static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
    static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
#endif

    // 256 -> 128
    // Narrowing casts: return the lower 128 bits of the input. Where the element type
    // changes as well, the 256-bit value is cast to the target element type first.
    template<> Vc_INTRINSIC __m128  avx_cast(__m256  v)
    {
        return _mm256_castps256_ps128(v);
    }
    template<> Vc_INTRINSIC __m128  avx_cast(__m256i v)
    {
        const __m256 wide = _mm256_castsi256_ps(v);
        return _mm256_castps256_ps128(wide);
    }
    template<> Vc_INTRINSIC __m128  avx_cast(__m256d v)
    {
        const __m256 wide = _mm256_castpd_ps(v);
        return _mm256_castps256_ps128(wide);
    }
    template<> Vc_INTRINSIC __m128i avx_cast(__m256  v)
    {
        const __m256i wide = _mm256_castps_si256(v);
        return _mm256_castsi256_si128(wide);
    }
    template<> Vc_INTRINSIC __m128i avx_cast(__m256i v)
    {
        return _mm256_castsi256_si128(v);
    }
    template<> Vc_INTRINSIC __m128i avx_cast(__m256d v)
    {
        const __m256i wide = _mm256_castpd_si256(v);
        return _mm256_castsi256_si128(wide);
    }
    template<> Vc_INTRINSIC __m128d avx_cast(__m256  v)
    {
        const __m256d wide = _mm256_castps_pd(v);
        return _mm256_castpd256_pd128(wide);
    }
    template<> Vc_INTRINSIC __m128d avx_cast(__m256i v)
    {
        const __m256d wide = _mm256_castsi256_pd(v);
        return _mm256_castpd256_pd128(wide);
    }
    template<> Vc_INTRINSIC __m128d avx_cast(__m256d v)
    {
        return _mm256_castpd256_pd128(v);
    }

    // 256 -> 256
    // Casts between the three 256-bit register types. Same-type casts return the
    // argument; the others use the matching _mm256_cast* intrinsic.
    template<> Vc_INTRINSIC __m256  avx_cast(__m256  v)
    {
        return v;
    }
    template<> Vc_INTRINSIC __m256  avx_cast(__m256i v)
    {
        return _mm256_castsi256_ps(v);
    }
    template<> Vc_INTRINSIC __m256  avx_cast(__m256d v)
    {
        return _mm256_castpd_ps(v);
    }
    template<> Vc_INTRINSIC __m256i avx_cast(__m256  v)
    {
        return _mm256_castps_si256(v);
    }
    template<> Vc_INTRINSIC __m256i avx_cast(__m256i v)
    {
        return v;
    }
    template<> Vc_INTRINSIC __m256i avx_cast(__m256d v)
    {
        return _mm256_castpd_si256(v);
    }
    template<> Vc_INTRINSIC __m256d avx_cast(__m256  v)
    {
        return _mm256_castps_pd(v);
    }
    template<> Vc_INTRINSIC __m256d avx_cast(__m256i v)
    {
        return _mm256_castsi256_pd(v);
    }
    template<> Vc_INTRINSIC __m256d avx_cast(__m256d v)
    {
        return v;
    }

    // simplify splitting 256-bit registers in 128-bit registers
    // lo128(v): lower 128 bits of a 256-bit register (via avx_cast).
    Vc_INTRINSIC Vc_CONST __m128  lo128(__m256  v)
    {
        return avx_cast<__m128>(v);
    }
    Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v)
    {
        return avx_cast<__m128d>(v);
    }
    Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v)
    {
        return avx_cast<__m128i>(v);
    }
    // hi128(v): upper 128 bits of a 256-bit register (via extract128<1>).
    Vc_INTRINSIC Vc_CONST __m128  hi128(__m256  v)
    {
        return extract128<1>(v);
    }
    Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v)
    {
        return extract128<1>(v);
    }
    Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v)
    {
        return extract128<1>(v);
    }

    // simplify combining 128-bit registers in 256-bit registers
    // concat(a, b): 256-bit register with a as the lower and b as the upper 128 bits.
    // a is widened with avx_cast, then b is inserted at position 1.
    Vc_INTRINSIC Vc_CONST __m256  concat(__m128  a, __m128  b)
    {
        const __m256 widened = avx_cast<__m256 >(a);
        return insert128<1>(widened, b);
    }
    Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b)
    {
        const __m256d widened = avx_cast<__m256d>(a);
        return insert128<1>(widened, b);
    }
    Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b)
    {
        const __m256i widened = avx_cast<__m256i>(a);
        return insert128<1>(widened, b);
    }

}  // namespace Casts
using namespace Casts;
}  // namespace AVX

// The AVX2 namespace reuses the cast helpers (avx_cast, zeroExtend, lo128, hi128,
// concat) defined in AVX::Casts.
namespace AVX2
{
using namespace AVX::Casts;
}  // namespace AVX2

namespace AVX
{
// Empty tag type naming a conversion from element type From to element type To; it is
// passed as the second argument of the convert() overloads below to select one of them.
template <typename From, typename To> struct ConvertTag {};

// float -> int: truncating conversion of eight floats.
Vc_INTRINSIC __m256i convert(__m256  v, ConvertTag<float , int>)
{
    return _mm256_cvttps_epi32(v);
}
// double -> int: truncating conversion of four doubles; the result fits in 128 bits.
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>)
{
    return _mm256_cvttpd_epi32(v);
}
// int -> int: identity.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int   , int>)
{
    return v;
}
// uint -> int: the register is returned unchanged.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint  , int>)
{
    return v;
}
// short -> int: sign-extends eight 16-bit values to 32 bits.
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepi16_epi32(v);
#else
    // Without AVX2: duplicate every 16-bit value into both halves of a 32-bit slot,
    // then shift right arithmetically by 16 so the upper half becomes the sign.
    const __m128i lowPairs = _mm_unpacklo_epi16(v, v);
    const __m128i highPairs = _mm_unpackhi_epi16(v, v);
    return AVX::srai_epi32<16>(concat(lowPairs, highPairs));
#endif
}
// ushort -> int: zero-extends eight 16-bit values to 32 bits.
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepu16_epi32(v);
#else
    // Without AVX2: duplicate every 16-bit value into both halves of a 32-bit slot,
    // then shift right logically by 16 so the upper half becomes zero.
    const __m128i lowPairs = _mm_unpacklo_epi16(v, v);
    const __m128i highPairs = _mm_unpackhi_epi16(v, v);
    return AVX::srli_epi32<16>(concat(lowPairs, highPairs));
#endif
}

// float -> uint. Lanes below 2^31 use the signed truncating conversion directly. Lanes
// at or above 2^31 are converted as (v - 2^31) and get 2^31 added back as an integer.
Vc_INTRINSIC __m256i convert(__m256  v, ConvertTag<float , uint>) {
    using namespace AVX;
    const auto twoPow31 = set2power31_ps();
    const __m256i direct = _mm256_cvttps_epi32(v);
    const auto rebased =
        add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, twoPow31)), set2power31_epu32());
    // per-lane mask: v >= 2^31 selects the rebased result
    const auto isLarge = cmpge_ps(v, twoPow31);
    return _mm256_castps_si256(_mm256_blendv_ps(
        _mm256_castsi256_ps(direct), _mm256_castsi256_ps(rebased), isLarge));
}
// double -> uint: floor the input, subtract 2^31 so that the signed conversion can be
// applied, then XOR the result with the set2power31 constant.
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
    using namespace AVX;
    const __m256d rebased = _mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u));
    const __m128i asSigned = _mm256_cvttpd_epi32(rebased);
    return _mm_xor_si128(asSigned, _mm_set2power31_epu32());
}
// int -> uint: the register is returned unchanged.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int   , uint>)
{
    return v;
}
// uint -> uint: identity.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint  , uint>)
{
    return v;
}
// short -> uint: sign-extends to 32 bits, exactly like short -> int, so that overload
// does the work.
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
    return convert(v, ConvertTag<short, int>());
}
// ushort -> uint: zero-extends to 32 bits, exactly like ushort -> int, so that
// overload does the work.
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
    return convert(v, ConvertTag<ushort, int>());
}

// float -> float: identity.
Vc_INTRINSIC __m256  convert(__m256  v, ConvertTag<float , float>)
{
    return v;
}
// double -> float: four doubles become four floats in a 128-bit register.
Vc_INTRINSIC __m128  convert(__m256d v, ConvertTag<double, float>)
{
    return _mm256_cvtpd_ps(v);
}
// int -> float: signed 32-bit integers to floats.
Vc_INTRINSIC __m256  convert(__m256i v, ConvertTag<int   , float>)
{
    return _mm256_cvtepi32_ps(v);
}
Vc_INTRINSIC __m256  convert(__m256i v, ConvertTag<uint  , float>) {
    // uint -> float.
    //
    // _mm256_cvtepi32_ps treats its input as signed, so lanes with the MSB set would
    // come out negative. Lanes without the MSB use the direct conversion. For the
    // others the value is split and recombined as
    //
    //     float(v & 0x7fff'fe00) + (float(0x8000'0000) + float(v & 0x1ff))
    //
    // Rationale (from the original analysis): float has an effective 24-bit mantissa,
    // so for inputs with the MSB set the low 8 bits are dropped and bit 8 decides ties
    // (round to even) - the 9 low bits determine the rounding. The simpler
    // float(v ^ 0x8000'0000) + float(0x8000'0000) makes its first rounding decision on
    // the wrong bits and rarely recovers in the second addition, e.g.
    //
    //   input        expected     simple formula
    //   0xc0000081   0xc0000100   0xc0000000
    //   0xc00000c0   0xc0000100   0xc0000000
    //   0xc0000140   0xc0000100   0xc0000200
    //
    // Adding 0x8000'0000 to the float value of the low 9 bits first ensures the
    // rounding decision is made on those 9 bits.
    using namespace AVX;
    const auto msbSet = cmplt_epi32(v, _mm256_setzero_si256());
    const __m256 direct = _mm256_cvtepi32_ps(v);
    const __m256 upperBits = _mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00)));
    const __m256 lowerBits = _mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x000001ff)));
    const __m256 corrected =
        _mm256_add_ps(upperBits, _mm256_add_ps(set2power31_ps(), lowerBits));
    return _mm256_blendv_ps(direct, corrected, _mm256_castsi256_ps(msbSet));
}
// short -> float: widen to 32-bit integers with the short -> int overload, then convert.
Vc_INTRINSIC __m256  convert(__m128i v, ConvertTag<short , float>)
{
    const __m256i widened = convert(v, ConvertTag< short, int>());
    return _mm256_cvtepi32_ps(widened);
}
// ushort -> float: widen to 32-bit integers with the ushort -> int overload, then convert.
Vc_INTRINSIC __m256  convert(__m128i v, ConvertTag<ushort, float>)
{
    const __m256i widened = convert(v, ConvertTag<ushort, int>());
    return _mm256_cvtepi32_ps(widened);
}

// float -> double: four floats become four doubles.
Vc_INTRINSIC __m256d convert(__m128  v, ConvertTag<float , double>)
{
    return _mm256_cvtps_pd(v);
}
// double -> double: identity.
Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>)
{
    return v;
}
// int -> double: four signed 32-bit integers become four doubles.
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int   , double>)
{
    return _mm256_cvtepi32_pd(v);
}
// uint -> double: XOR the input with the _mm_setmin_epi32() constant, convert the
// result as signed 32-bit integers, then add 2^31 as a double.
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint  , double>) {
    using namespace AVX;
    const __m128i rebased = _mm_xor_si128(v, _mm_setmin_epi32());
    const __m256d asSigned = _mm256_cvtepi32_pd(rebased);
    return _mm256_add_pd(asSigned, set1_pd(1u << 31));
}
// short -> double: widen with the SSE short -> int overload, then int -> double.
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>)
{
    const auto widened = convert(v, SSE::ConvertTag< short, int>());
    return convert(widened, ConvertTag<int, double>());
}
// ushort -> double: widen with the SSE ushort -> int overload, then int -> double.
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>)
{
    const auto widened = convert(v, SSE::ConvertTag<ushort, int>());
    return convert(widened, ConvertTag<int, double>());
}

// int -> short: keeps the low 16 bits of each of the eight 32-bit values.
// Three rounds of 16-bit interleaving gather the low halves of all eight inputs, in
// order, into the result.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int   , short>) {
    const __m128i lower = lo128(v);
    const __m128i upper = hi128(v);
    const auto mixLo = _mm_unpacklo_epi16(lower, upper);
    const auto mixHi = _mm_unpackhi_epi16(lower, upper);
    const auto evens = _mm_unpacklo_epi16(mixLo, mixHi);
    const auto odds = _mm_unpackhi_epi16(mixLo, mixHi);
    return _mm_unpacklo_epi16(evens, odds);
}
// uint -> short: keeps the low 16 bits of each 32-bit value, exactly like
// int -> short, so that overload does the work.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint  , short>) {
    return convert(v, ConvertTag<int, short>());
}
// float -> short: float -> int, then int -> short.
Vc_INTRINSIC __m128i convert(__m256  v, ConvertTag<float , short>)
{
    const __m256i asInt = convert(v, ConvertTag<float, int>());
    return convert(asInt, ConvertTag<int, short>());
}
// double -> short: double -> int (128-bit result), then the SSE int -> short overload.
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>)
{
    const __m128i asInt = convert(v, ConvertTag<double, int>());
    return convert(asInt, SSE::ConvertTag<int, short>());
}
// short -> short: identity.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>)
{
    return v;
}
// ushort -> short: the register is returned unchanged.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>)
{
    return v;
}

// int -> ushort: keeps the low 16 bits of each 32-bit value. This is the same
// operation as int -> short, so that overload does the work.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int   , ushort>) {
    return convert(v, ConvertTag<int, short>());
}
// uint -> ushort: keeps the low 16 bits of each 32-bit value. This is the same
// operation as int -> ushort, so that overload does the work.
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint  , ushort>) {
    return convert(v, ConvertTag<int, ushort>());
}
// float -> ushort: float -> uint, then uint -> ushort.
Vc_INTRINSIC __m128i convert(__m256  v, ConvertTag<float , ushort>)
{
    const __m256i asUint = convert(v, ConvertTag<float, uint>());
    return convert(asUint, ConvertTag<uint, ushort>());
}
// double -> ushort: double -> uint (128-bit result), then the SSE uint -> ushort overload.
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>)
{
    const __m128i asUint = convert(v, ConvertTag<double, uint>());
    return convert(asUint, SSE::ConvertTag<uint, ushort>());
}
// short -> ushort: the register is returned unchanged.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>)
{
    return v;
}
// ushort -> ushort: identity.
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>)
{
    return v;
}

// Front end taking the conversion as explicit template arguments
// (convert<From, To>(v)) and forwarding to the matching tag overload.
// The parameter is the SSE vector type of From when To is wider than From,
// and the AVX vector type of From otherwise. The trailing return type removes
// this template from overload resolution if no tag overload accepts v.
template <typename From, typename To>
Vc_INTRINSIC auto convert(
    typename std::conditional<(sizeof(From) < sizeof(To)),
                              typename SSE::VectorTraits<From>::VectorType,
                              typename AVX::VectorTypeHelper<From>::Type>::type v)
    -> decltype(convert(v, ConvertTag<From, To>()))
{
    using Tag = ConvertTag<From, To>;
    return convert(v, Tag());
}

// Widening conversions (sizeof(From) < sizeof(To)) called with a full AVX
// register: only what lo128(v) returns is passed on to the tag overload
// (presumably the lower 128-bit half; lo128 is defined elsewhere).
template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
    -> decltype(convert(lo128(v), ConvertTag<From, To>()))
{
    const auto lowPart = lo128(v);
    return convert(lowPart, ConvertTag<From, To>());
}
}  // namespace AVX
}  // namespace Vc

#endif // VC_AVX_CASTS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_VECTOR_H_
#define VC_SSE_VECTOR_H_

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_VECTORHELPER_H_
#define VC_SSE_VECTORHELPER_H_

#include <limits>

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
// Generators for static member functions of the VectorHelper specializations
// below. Vc_OPn(name, code) declares `name` taking n VectorType arguments
// (named a, b, c) and returning the expression `code`, which may refer to
// those arguments. The enclosing struct must provide a VectorType typedef.
#define Vc_OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define Vc_OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
#define Vc_OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
#define Vc_OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }

        // Memory access and constant helpers for a register of packed floats
        // (_M128). The load/store overload is selected through the
        // Flags::EnableIf... member types.
        template<> struct VectorHelper<_M128>
        {
            using VectorType = _M128;

            // ---- loads ----
            template<typename Flags>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr)
            {
                return _mm_load_ps(x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr)
            {
                return _mm_loadu_ps(x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr)
            {
                return _mm_stream_load(x);
            }

            // ---- unmasked stores ----
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr)
            {
                _mm_store_ps(mem, x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr)
            {
                _mm_storeu_ps(mem, x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr)
            {
                _mm_stream_ps(mem, x);
            }
            // unaligned + streaming: a byte-masked store with every mask bit set
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr)
            {
                const auto bits = _mm_castps_si128(x);
                _mm_maskmoveu_si128(bits, _mm_setallone_si128(), reinterpret_cast<char *>(mem));
            }

            // ---- masked store ----
            // Pre-AVX there is only this one mask-store instruction. Emulating
            // it with load -> blend -> store would read and write memory this
            // thread does not actually touch, violating the C++ memory model.
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m)
            {
                const auto bits = _mm_castps_si128(x);
                const auto mask = _mm_castps_si128(m);
                _mm_maskmoveu_si128(bits, mask, reinterpret_cast<char *>(mem));
            }

            // ---- constants and blending ----
            static Vc_ALWAYS_INLINE Vc_CONST VectorType allone() { return _mm_setallone_ps(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return _mm_setzero_ps(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType blend(const VectorType a, const VectorType b, const VectorType c)
            {
                return blendv_ps(a, b, c);
            }
        };


        // Memory access and constant helpers for a register of packed doubles
        // (_M128D). The load/store overload is selected through the
        // Flags::EnableIf... member types.
        template<> struct VectorHelper<_M128D>
        {
            using VectorType = _M128D;

            // ---- loads ----
            template<typename Flags>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr)
            {
                return _mm_load_pd(x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr)
            {
                return _mm_loadu_pd(x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr)
            {
                return _mm_stream_load(x);
            }

            // ---- unmasked stores ----
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr)
            {
                _mm_store_pd(mem, x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr)
            {
                _mm_storeu_pd(mem, x);
            }
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr)
            {
                _mm_stream_pd(mem, x);
            }
            // unaligned + streaming: a byte-masked store with every mask bit set
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr)
            {
                const auto bits = _mm_castpd_si128(x);
                _mm_maskmoveu_si128(bits, _mm_setallone_si128(), reinterpret_cast<char *>(mem));
            }

            // ---- masked store ----
            // Pre-AVX there is only this one mask-store instruction. Emulating
            // it with load -> blend -> store would read and write memory this
            // thread does not actually touch, violating the C++ memory model.
            template<typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m)
            {
                const auto bits = _mm_castpd_si128(x);
                const auto mask = _mm_castpd_si128(m);
                _mm_maskmoveu_si128(bits, mask, reinterpret_cast<char *>(mem));
            }

            // ---- constants and blending ----
            static Vc_ALWAYS_INLINE Vc_CONST VectorType allone() { return _mm_setallone_pd(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return _mm_setzero_pd(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType blend(const VectorType a, const VectorType b, const VectorType c)
            {
                return blendv_pd(a, b, c);
            }
        };

        // Memory access and constant helpers for an integer register (_M128I).
        // The element type T of the memory is a template parameter; the
        // load/store overload is selected through the Flags::EnableIf...
        // member types.
        template<> struct VectorHelper<_M128I>
        {
            using VectorType = _M128I;

            // ---- loads ----
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr)
            {
                return _mm_load_si128(reinterpret_cast<const VectorType *>(x));
            }
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr)
            {
                return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x));
            }
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr)
            {
                return _mm_stream_load(x);
            }

            // ---- unmasked stores ----
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr)
            {
                _mm_store_si128(reinterpret_cast<VectorType *>(mem), x);
            }
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr)
            {
                _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x);
            }
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr)
            {
                _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x);
            }
            // unaligned + streaming: a byte-masked store with every mask bit set
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr)
            {
                _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem));
            }

            // ---- masked store ----
            // Pre-AVX there is only this one mask-store instruction. Emulating
            // it with load -> blend -> store would read and write memory this
            // thread does not actually touch, violating the C++ memory model.
            template<typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m)
            {
                _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem));
            }

            // ---- constants and blending ----
            static Vc_ALWAYS_INLINE Vc_CONST VectorType allone() { return _mm_setallone_si128(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return _mm_setzero_si128(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType blend(const VectorType a, const VectorType b, const VectorType c)
            {
                return blendv_epi8(a, b, c);
            }
        };

// The code-taking Vc_OP1..3 generators are retired here. The macros defined
// next build the intrinsic name by pasting the operation name onto Vc_SUFFIX,
// which every VectorHelper specialization #defines (and #undefs) around its
// use of them.
#undef Vc_OP1
#undef Vc_OP2
#undef Vc_OP3

// Vc_OP1(op): unary member op(a) calling _mm_<op>_<Vc_SUFFIX>(a).
#define Vc_OP1(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
// Vc_OP(op): binary member op(a, b) calling _mm_<op>_<Vc_SUFFIX>(a, b).
#define Vc_OP(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
// Vc_OP_(op): like Vc_OP, for operation names that already end in '_'
// (or_, and_, xor_): calls _mm_<op><Vc_SUFFIX>(a, b).
#define Vc_OP_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op    , Vc_SUFFIX)(a, b); }
// Vc_OPx(op, op2): binary member named op that calls _mm_<op2>_<Vc_SUFFIX>(a, b).
#define Vc_OPx(op, op2) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
// Vc_OP_CAST_(op): binary member op(a, b) that casts both operands to packed
// float, applies _mm_<op>ps and casts the result back to the Vc_SUFFIX type.
#define Vc_OP_CAST_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
            _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
              Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
        }
// Vc_MINMAX: element-wise min(a, b) and max(a, b) via _mm_min_/_mm_max_<Vc_SUFFIX>.
#define Vc_MINMAX \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }

        // Arithmetic, logic, classification and horizontal-reduction primitives
        // for two packed doubles held in an _M128D. Most members are generated
        // by the Vc_OP* macros with Vc_SUFFIX set to pd.
        template<> struct VectorHelper<double> {
            typedef _M128D VectorType;
            typedef double EntryType;
#define Vc_SUFFIX pd

            // bitwise or_/and_/xor_ (a, b)
            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            // ANDs a with the mask bits (mask is passed as a float register and reinterpreted)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
            // broadcast of a single value
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            // arguments are forwarded to _mm_set_pd in the given order
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }// set(1.); }

            // fma: v1 = v1 * v2 + v3, written back into v1.
#ifdef Vc_IMPL_FMA4
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
                v1 = _mm_macc_pd(v1, v2, v3);
            }
#else
            // Software fallback: both factors are split into a part selected by
            // c_general::highMaskDouble (h) and the remainder (l); the partial
            // products and v3 are then summed in a fixed order.
            // NOTE(review): the mask value is defined elsewhere - presumably it
            // keeps the upper mantissa bits so the partial products are exact.
            static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
                VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
                VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
                // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
                // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
                asm("":"+x"(h1), "+x"(h2));
#endif
                // remainders of the factors after removing the masked parts
                const VectorType l1 = _mm_sub_pd(v1, h1);
                const VectorType l2 = _mm_sub_pd(v2, h2);
                const VectorType ll = mul(l1, l2);
                const VectorType lh = add(mul(l1, h2), mul(h1, l2));
                const VectorType hh = mul(h1, h2);
                // ll < lh < hh for all entries is certain
                const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
                // b/c: lh and v3 ordered by magnitude (assuming blendv_pd picks its
                // second argument where the mask is set): b is the smaller, c the larger
                const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
                const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
                v1 = add(add(ll, b), add(c, hh));
            }
#endif

            // element-wise add/sub/mul (a, b)
            Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

            // element-wise square root
            Vc_OP1(sqrt)
            // 1 / sqrt(x), computed with a full division
            static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
                return _mm_div_pd(one(), sqrt(x));
            }
            // 1 / x, computed with a full division
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
                return _mm_div_pd(one(), x);
            }
            // mask of entries that are NaN (x is unordered with itself)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
                return _mm_cmpunord_pd(x, x);
            }
            // mask of finite entries: 0 * x is NaN for infinite or NaN x, which
            // makes the ordered comparison fail for those entries
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
                return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
            }
            // mask of entries whose |x| bit pattern equals the constant loaded from
            // c_log<double>::d(1) - presumably +infinity; the table is defined elsewhere
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
                return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
            }
            // ANDs a with the mask from _mm_setabsmask_pd()
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
                return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
            }

            // element-wise min(a, b) / max(a, b)
            Vc_MINMAX
            // Horizontal reductions: combine the low entry with the high entry
            // and return the scalar result.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
                return _mm_cvtsd_f64(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
                return _mm_cvtsd_f64(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(a);
            }
#undef Vc_SUFFIX
            // Rounds every entry to an integral value.
            // NOTE(review): the non-SSE4.1 path round-trips through 32-bit
            // integers, so it is only meaningful for values that fit into an
            // int32, and it uses whatever rounding mode is currently set.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
                return _mm_round_pd(a, _MM_FROUND_NINT);
#else
                //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
                return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
#endif
            }
        };

        // Arithmetic, logic, classification and horizontal-reduction primitives
        // for four packed floats held in an _M128. Most members are generated
        // by the Vc_OP* macros with Vc_SUFFIX set to ps.
        template<> struct VectorHelper<float> {
            typedef float EntryType;
            typedef _M128 VectorType;
#define Vc_SUFFIX ps

            // bitwise or_/and_/xor_ (a, b)
            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            // ANDs a with the mask bits
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
            // broadcast of a single value
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            // arguments are forwarded to _mm_set_ps in the given order
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }// set(1.f); }
            // converts two double registers to float and joins them: a fills
            // the two low entries, b the two high entries
            static Vc_ALWAYS_INLINE Vc_CONST _M128 concat(_M128D a, _M128D b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }

            // fma: v1 = v1 * v2 + v3, written back into v1.
#ifdef Vc_IMPL_FMA4
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
                v1 = _mm_macc_ps(v1, v2, v3);
            }
#else
            // Software fallback: widen both halves of every operand to double,
            // multiply and add in double precision, then narrow back to float.
            static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
                // *_0: entries 0 and 1, *_1: entries 2 and 3
                __m128d v1_0 = _mm_cvtps_pd(v1);
                __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
                __m128d v2_0 = _mm_cvtps_pd(v2);
                __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
                __m128d v3_0 = _mm_cvtps_pd(v3);
                __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
                v1 = _mm_movelh_ps(
                        _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
                        _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
            }
#endif

            // element-wise add/sub/mul (a, b)
            Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

            // sqrt maps to _mm_sqrt_ps; rsqrt maps to _mm_rsqrt_ps, which is
            // the hardware approximation, not a full-precision 1/sqrt
            Vc_OP1(sqrt) Vc_OP1(rsqrt)
            // mask of entries that are NaN (x is unordered with itself)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
                return _mm_cmpunord_ps(x, x);
            }
            // mask of finite entries: 0 * x is NaN for infinite or NaN x, which
            // makes the ordered comparison fail for those entries
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
                return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
            }
            // mask of entries whose |x| bit pattern equals the constant loaded from
            // c_log<float>::d(1) - presumably +infinity; the table is defined elsewhere
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
                return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
            }
            // approximate 1 / x (_mm_rcp_ps), not a full division
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
                return _mm_rcp_ps(x);
            }
            // ANDs a with the mask from _mm_setabsmask_ps()
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
                return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
            }

            // element-wise min(a, b) / max(a, b)
            Vc_MINMAX
            // Horizontal reductions: two folding steps leave the result in
            // entry 0, which is returned as a scalar.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = _mm_min_ps(a, _mm_movehl_ps(a, a));   // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
                a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = _mm_max_ps(a, _mm_movehl_ps(a, a));   // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
                a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                // multiply with the reversed vector: entry 0 = a0*a3, entry 1 = a1*a2
                a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
                // bring entry 1 into position 0 and multiply the two partial products
                a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                // add the reversed vector: entry 0 = a0+a3, entry 1 = a1+a2
                a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
                // bring entry 1 into position 0 and add the two partial sums
                a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(a);
            }
#undef Vc_SUFFIX
            // Rounds every entry to an integral value.
            // NOTE(review): the non-SSE4.1 path round-trips through 32-bit
            // integers, so it is only meaningful for values that fit into an
            // int32, and it uses whatever rounding mode is currently set.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
                return _mm_round_ps(a, _MM_FROUND_NINT);
#else
                //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
                return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
#endif
            }
        };

        // Arithmetic, logic, shift and horizontal-reduction primitives for four
        // packed signed 32-bit integers held in an _M128I.
        template<> struct VectorHelper<int> {
            typedef int EntryType;
            typedef _M128I VectorType;
#define Vc_SUFFIX si128

            // bitwise or_/and_/xor_ (a, b)
            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            // ANDs a with the mask bits (mask is passed as a float register and reinterpreted)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

            // broadcast of a single value
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            // arguments are forwarded to _mm_set_epi32 in the given order
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

            // v1 = v1 * v2 + v3 (plain multiply followed by add)
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            // arithmetic (sign-propagating) right shift
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
            // element-wise min/max
            static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
            // Horizontal reductions: first fold the upper 64 bits onto the
            // lower 64 bits, then swap the two low 32-bit entries and fold
            // again; the result ends up in entry 0.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            // Element-wise product, keeping the low 32 bits of every result.
#ifdef Vc_IMPL_SSE4_1
            static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
#else
            // Without SSE4.1 there is no 32-bit mullo: form the 64-bit
            // products of the even and the odd entries separately, then
            // gather the low 32 bits of each product.
            static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
                const VectorType aShift = _mm_srli_si128(a, 4);
                const VectorType ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2]
                const VectorType bShift = _mm_srli_si128(b, 4);
                const VectorType ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3]
                // shuffle immediate 8 moves dwords 0 and 2 (the low halves of
                // the two products) into the two lowest positions
                return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
            }
#endif
            // Horizontal product. It only relies on the element-wise mul(a, b)
            // above, so it is provided for both the SSE4.1 and the fallback
            // implementation (previously it was missing without SSE4.1, unlike
            // the unsigned int helper).
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }

            // element-wise add/sub (a, b)
            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            // integers are already integral: identity
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
        };

        // Arithmetic, logic, shift and horizontal-reduction primitives for four
        // packed unsigned 32-bit integers held in an _M128I.
        template<> struct VectorHelper<unsigned int> {
            using EntryType = unsigned int;
            using VectorType = _M128I;

            // bitwise or_/and_/xor_ (a, b), generated via the float-domain cast macro
#define Vc_SUFFIX si128
            Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
#undef Vc_SUFFIX

            // element-wise add/sub (a, b)
#define Vc_SUFFIX epi32
            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX

            // ---- constants and construction ----
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero()
            {
                return _mm_setzero_si128();
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()
            {
                return _mm_setone_epu32();
            }
            // broadcast of a single value
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a)
            {
                return _mm_set1_epi32(a);
            }
            // arguments are forwarded to _mm_set_epi32 in the given order
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d)
            {
                return _mm_set_epi32(a, b, c, d);
            }

            // ANDs a with the mask bits (mask is passed as a float register and reinterpreted)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask)
            {
                const VectorType maskBits = _mm_castps_si128(mask);
                return _mm_and_si128(maskBits, a);
            }

            // ---- element-wise operations ----
            static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b)
            {
                return min_epu32(a, b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b)
            {
                return max_epu32(a, b);
            }
            // delegates to the signed implementation
            static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b)
            {
                return VectorHelper<int>::mul(a, b);
            }
            // v1 = v1 * v2 + v3 (plain multiply followed by add)
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3)
            {
                const VectorType product = mul(v1, v2);
                v1 = add(product, v3);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift)
            {
                return _mm_slli_epi32(a, shift);
            }
            // logical (zero-filling) right shift
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift)
            {
                return _mm_srli_epi32(a, shift);
            }
            // integers are already integral: identity
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a)
            {
                return a;
            }

            // ---- horizontal reductions ----
            // Step 1 folds the upper 64 bits onto the lower 64 bits. Step 2
            // swaps the two low 32-bit entries (written as a 16-bit shuffle
            // for speed) and folds again; the result is read from entry 0.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a)
            {
                const VectorType halves = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                const VectorType all = min(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(all);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a)
            {
                const VectorType halves = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                const VectorType all = max(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(all);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a)
            {
                const VectorType halves = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                const VectorType all = mul(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(all);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a)
            {
                const VectorType halves = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                const VectorType all = add(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(all);
            }

            // Disabled: multiplication by a compile-time constant via shifts.
//X             template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
//X                 switch (b) {
//X                     case    0: return zero();
//X                     case    1: return a;
//X                     case    2: return _mm_slli_epi32(a,  1);
//X                     case    4: return _mm_slli_epi32(a,  2);
//X                     case    8: return _mm_slli_epi32(a,  3);
//X                     case   16: return _mm_slli_epi32(a,  4);
//X                     case   32: return _mm_slli_epi32(a,  5);
//X                     case   64: return _mm_slli_epi32(a,  6);
//X                     case  128: return _mm_slli_epi32(a,  7);
//X                     case  256: return _mm_slli_epi32(a,  8);
//X                     case  512: return _mm_slli_epi32(a,  9);
//X                     case 1024: return _mm_slli_epi32(a, 10);
//X                     case 2048: return _mm_slli_epi32(a, 11);
//X                 }
//X                 return mul(a, set(b));
//X             }
        };

        template<> struct VectorHelper<signed short> {
            typedef _M128I VectorType;
            typedef signed short EntryType;
#define Vc_SUFFIX si128

            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
            static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
            static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                    const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
                return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
            }

            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
                v1 = add(mul(v1, v2), v3); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }

            Vc_OPx(mul, mullo)
            Vc_OP(min) Vc_OP(max)
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }

            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            // Rounding is the identity for this helper: the argument is returned unchanged.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a)
            {
                return a;
            }
        };

        // VectorHelper specialization for unsigned short entries held in an _M128I.
        // Vc_SUFFIX is redefined several times inside the struct. NOTE(review): the
        // Vc_CAT2 / Vc_OP* macros (defined outside this excerpt) presumably paste the
        // current Vc_SUFFIX onto intrinsic names, so the position of every
        // #define/#undef below matters - do not reorder members across them.
        template<> struct VectorHelper<unsigned short> {
            typedef _M128I VectorType;
            typedef unsigned short EntryType;
#define Vc_SUFFIX si128
            // whole-register bitwise operations, generated by Vc_OP_CAST_
            Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
            // returns a vector built by the "_mm_setzero_" intrinsic for the si128 suffix
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            // ANDs a with the float mask reinterpreted as integer bits, so every bit of a
            // that is not set in the mask is cleared
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#ifdef Vc_IMPL_SSE4_1
            // narrows the 32-bit lanes of a and b into one vector of 16-bit lanes
            static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packus_epi32(a, b); }
#else
            // FIXME too bad, but this is broken without SSE 4.1
            // NOTE(review): the unpack ladder below gathers the low 16 bits of every 32-bit
            // lane of a and b (see the lane comments; X marks don't-care lanes). Presumably
            // "broken" refers to the missing saturation compared to _mm_packus_epi32 -
            // confirm.
            static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) {
                auto tmp0 = _mm_unpacklo_epi16(a, b); // 0 4 X X 1 5 X X
                auto tmp1 = _mm_unpackhi_epi16(a, b); // 2 6 X X 3 7 X X
                auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // 0 2 4 6 X X X X
                auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // 1 3 5 7 X X X X
                return _mm_unpacklo_epi16(tmp2, tmp3); // 0 1 2 3 4 5 6 7
            }
#endif
            // expand0 / expand1 interleave the low / high four 16-bit lanes of x with zeros,
            // i.e. they zero-extend those lanes to 32 bits
            static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
            static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epu16
            // forwards to the "_mm_setone_" helper for the epu16 suffix (defined elsewhere)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

            // The //X block below is disabled code (a compile-time multiply-by-constant via
            // shifts); it is kept for reference only.
//X             template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
//X                 switch (b) {
//X                     case    0: return zero();
//X                     case    1: return a;
//X                     case    2: return _mm_slli_epi16(a,  1);
//X                     case    4: return _mm_slli_epi16(a,  2);
//X                     case    8: return _mm_slli_epi16(a,  3);
//X                     case   16: return _mm_slli_epi16(a,  4);
//X                     case   32: return _mm_slli_epi16(a,  5);
//X                     case   64: return _mm_slli_epi16(a,  6);
//X                     case  128: return _mm_slli_epi16(a,  7);
//X                     case  256: return _mm_slli_epi16(a,  8);
//X                     case  512: return _mm_slli_epi16(a,  9);
//X                     case 1024: return _mm_slli_epi16(a, 10);
//X                     case 2048: return _mm_slli_epi16(a, 11);
//X                 }
//X                 return mul(a, set(b));
//X             }
            // Two-operand min/max via min_epu16 / max_epu16. This variant is compiled
            // unless USE_INCORRECT_UNSIGNED_COMPARE is defined without SSE4.1; in that case
            // the Vc_OP(min) Vc_OP(max) pair further down is used instead.
#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
            static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
#endif
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
            // per-lane shift left by `shift` bits ("_mm_slli_" + current suffix)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            // per-lane shift right by `shift` bits ("_mm_srli_" + current suffix)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
            }

            // multiply-add composed of the two-operand mul() and add(): v1 = v1 * v2 + v3
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

            Vc_OPx(mul, mullo) // should work correctly for all values
#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
            Vc_OP(min) Vc_OP(max) // XXX breaks for values with MSB set
#endif
            // Horizontal reductions: each one folds the eight lanes into lane 0 by combining
            // the vector with permuted copies of itself (swap 64-bit halves, swap the
            // 32-bit pairs of the low half, broadcast lane 1), then returns lane 0.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            // broadcast a single value to all lanes ("_mm_set1_" + current suffix)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            // per-lane initialization; the arguments are forwarded in the given order to the
            // "_mm_set_" intrinsic for the current suffix
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
                    const EntryType d, const EntryType e, const EntryType f,
                    const EntryType g, const EntryType h) {
                return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
            }

            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            // rounding is the identity for integer entries: the argument is returned as is
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
        };
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
#undef Vc_OP_CAST_
#undef Vc_MINMAX

}  // namespace SSE
}  // namespace Vc


#endif // VC_SSE_VECTORHELPER_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_MASK_H_
#define VC_SSE_MASK_H_

/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_DETAIL_H_
#define VC_SSE_DETAIL_H_

#ifdef Vc_IMPL_AVX
#endif


namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// (converting) load functions {{{1
// Empty tag type carrying the requested vector type V and destination entry type DstT;
// the load() overloads below take it as their last parameter to select a conversion.
template <typename V, typename DstT> struct LoadTag {};

// when_(un)aligned{{{2
// Tag parameter type for overload selection: it is implicitly constructible only from
// a flags type F that declares the nested type F::EnableIfAligned. For any other F the
// constructor template drops out of overload resolution (SFINAE).
class when_aligned
{
public:
    template <typename F>
    constexpr when_aligned(F, typename F::EnableIfAligned = nullptr) {}
};

// Tag parameter type for overload selection: it is implicitly constructible only from
// a flags type F that declares the nested type F::EnableIfUnaligned. For any other F
// the constructor template drops out of overload resolution (SFINAE).
class when_unaligned
{
public:
    template <typename F> constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr) {}
};

// Tag parameter type for overload selection: it is implicitly constructible only from
// a flags type F that declares the nested type F::EnableIfStreaming. For any other F
// the constructor template drops out of overload resolution (SFINAE).
class when_streaming
{
public:
    template <typename F> constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr) {}
};

// load16{{{2
// 16-byte loads of four floats. The unnamed second parameter is a tag whose type
// (when_aligned / when_unaligned / when_streaming) selects the load instruction.
Vc_INTRINSIC __m128 load16(const float *mem, when_aligned)
{
    const __m128 loaded = _mm_load_ps(mem);
    return loaded;
}
Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned)
{
    const __m128 loaded = _mm_loadu_ps(mem);
    return loaded;
}
Vc_INTRINSIC __m128 load16(const float *mem, when_streaming)
{
    const __m128 loaded = SseIntrinsics::_mm_stream_load(mem);
    return loaded;
}
// 16-byte loads of two doubles. The unnamed second parameter is a tag whose type
// (when_aligned / when_unaligned / when_streaming) selects the load instruction.
Vc_INTRINSIC __m128d load16(const double *mem, when_aligned)
{
    const __m128d loaded = _mm_load_pd(mem);
    return loaded;
}
Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned)
{
    const __m128d loaded = _mm_loadu_pd(mem);
    return loaded;
}
Vc_INTRINSIC __m128d load16(const double *mem, when_streaming)
{
    const __m128d loaded = SseIntrinsics::_mm_stream_load(mem);
    return loaded;
}
// 16-byte loads of integral data. T must be an integral type (enforced by the
// static_assert); the unnamed tag parameter selects the load instruction.
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_aligned)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    const __m128i *const src = reinterpret_cast<const __m128i *>(mem);
    return _mm_load_si128(src);
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    const __m128i *const src = reinterpret_cast<const __m128i *>(mem);
    return _mm_loadu_si128(src);
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_streaming)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    const __m128i loaded = SseIntrinsics::_mm_stream_load(mem);
    return loaded;
}

// MSVC workarounds{{{2
#ifdef Vc_MSVC
// work around: "fatal error C1001: An internal error has occurred in the compiler."
// The overloads below are only compiled for MSVC. Each one handles a load without
// conversion (DstT equal to the pointee type of mem, V the matching 128-bit vector
// type) by forwarding directly to load16(mem, f). On MSVC the generic load() further
// down is disabled for DstT == SrcT, so these take over that case.
// NOTE(review): the exact spelling of these templates is presumably what avoids the
// internal compiler error - do not restyle without testing on MSVC.

// double -> __m128d
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128d load(const double *mem, F f,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m128d>::value)> = nullarg)
{
    return load16(mem, f);
}

// float -> __m128
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128 load(const float *mem, F f,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m128>::value)> = nullarg)
{
    return load16(mem, f);
}

// uint -> __m128i
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const uint *mem, F f,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}

// int -> __m128i
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const int *mem, F f,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}

// short -> __m128i
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const short *mem, F f,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}

// ushort -> __m128i
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const ushort *mem, F f,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}
#endif  // Vc_MSVC

// generic load{{{2
// Entry point for (converting) loads: builds the LoadTag<V, DstT> and dispatches to the
// matching tagged load() overload. The overload is disabled for integral conversions
// that would narrow (sizeof(DstT) < sizeof(SrcT)) and, on MSVC, for DstT == SrcT.
template <typename V, typename DstT, typename SrcT, typename Flags,
          typename = enable_if<
#ifdef Vc_MSVC
              !std::is_same<DstT, SrcT>::value &&
#endif
              (!std::is_integral<DstT>::value || !std::is_integral<SrcT>::value ||
               sizeof(DstT) >= sizeof(SrcT))>>
Vc_INTRINSIC V load(const SrcT *mem, Flags flags)
{
    using Tag = LoadTag<V, DstT>;
    return load(mem, flags, Tag());
}

// no conversion load from any T {{{2
// Load without conversion (source entry type equals destination entry type) for 16-byte
// vector types: forwards to SSE::VectorHelper<V>::load<Flags>.
template <typename V, typename T, typename Flags>
Vc_INTRINSIC V
    load(const T *mem, Flags, LoadTag<V, T>, enable_if<sizeof(V) == 16> = nullarg)
{
    using Helper = SSE::VectorHelper<V>;
    return Helper::template load<Flags>(mem);
}

// short {{{2
// ushort -> short: same bit pattern, so this is a plain 16-byte load.
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>)
{
    using Helper = SSE::VectorHelper<__m128i>;
    return Helper::load<Flags>(mem);
}
// uchar -> short: load 8 bytes and widen them with SSE::cvtepu8_epi16.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>)
{
    // Flags is deliberately ignored: the only streaming load available fetches 16
    // bytes, twice what is needed here, which could read out of bounds and trigger an
    // unaligned load exception.
    const __m128i eightBytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::cvtepu8_epi16(eightBytes);
}
// schar -> short: load 8 bytes and widen them with SSE::cvtepi8_epi16.
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>)
{
    // Flags is deliberately ignored: the only streaming load available fetches 16
    // bytes, twice what is needed here, which could read out of bounds and trigger an
    // unaligned load exception.
    const __m128i eightBytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::cvtepi8_epi16(eightBytes);
}

// ushort {{{2
// uchar -> ushort: load 8 bytes and widen them with SSE::cvtepu8_epi16.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>)
{
    // Flags is deliberately ignored: the only streaming load available fetches 16
    // bytes, twice what is needed here, which could read out of bounds and trigger an
    // unaligned load exception.
    const __m128i eightBytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::cvtepu8_epi16(eightBytes);
}

// int {{{2
// uint -> int: same bit pattern, so this is a plain 16-byte load.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>)
{
    using Helper = SSE::VectorHelper<__m128i>;
    return Helper::load<Flags>(mem);
}
// The narrower sources below ignore Flags: the 32/64 bit loads they use are not
// available as streaming loads and may always be unaligned, so there is no difference
// between streaming and alignment.

// ushort -> int: load 8 bytes, widen with SSE::cvtepu16_epi32.
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>)
{
    const __m128i fourEntries = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::cvtepu16_epi32(fourEntries);
}
// short -> int: load 8 bytes, widen with SSE::cvtepi16_epi32.
template <typename Flags>
Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>)
{
    const __m128i fourEntries = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::cvtepi16_epi32(fourEntries);
}
// uchar -> int: load 4 bytes, widen with SSE::cvtepu8_epi32.
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>)
{
    const __m128i fourEntries =
        _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(mem));
    return SSE::cvtepu8_epi32(fourEntries);
}
// schar -> int: load 4 bytes, widen with SSE::cvtepi8_epi32.
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>)
{
    const __m128i fourEntries =
        _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(mem));
    return SSE::cvtepi8_epi32(fourEntries);
}

// uint {{{2
// ushort -> uint: load 8 bytes, widen with SSE::cvtepu16_epi32 (Flags is ignored).
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>)
{
    const __m128i fourEntries = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::cvtepu16_epi32(fourEntries);
}
// uchar -> uint: load 4 bytes, widen with SSE::cvtepu8_epi32 (Flags is ignored).
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>)
{
    const __m128i fourEntries =
        _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(mem));
    return SSE::cvtepu8_epi32(fourEntries);
}

// double {{{2
// Converting loads into a vector of two doubles. Every overload reads just enough
// memory for two source entries, ignores Flags, and hands the raw register to the
// matching SSE::convert<Src, double>.

// float -> double (8 bytes read into the low half of a zeroed register)
template <typename Flags>
Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128 raw = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64 *>(mem));
    return SSE::convert<float, double>(raw);
}
// uint -> double (8 bytes read)
template <typename Flags>
Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128i raw = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::convert<uint, double>(raw);
}
// int -> double (8 bytes read)
template <typename Flags>
Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128i raw = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return SSE::convert<int, double>(raw);
}
// ushort -> double (4 bytes read)
template <typename Flags>
Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128i raw = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(mem));
    return SSE::convert<ushort, double>(raw);
}
// short -> double (4 bytes read)
template <typename Flags>
Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128i raw = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(mem));
    return SSE::convert<short, double>(raw);
}
// uchar -> double (2 bytes read and broadcast to every 16-bit lane)
template <typename Flags>
Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128i raw = _mm_set1_epi16(*reinterpret_cast<const MayAlias<short> *>(mem));
    return SSE::convert<uchar, double>(raw);
}
// schar -> double (2 bytes read and broadcast to every 16-bit lane)
template <typename Flags>
Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>)
{
    const __m128i raw = _mm_set1_epi16(*reinterpret_cast<const MayAlias<short> *>(mem));
    return SSE::convert<char, double>(raw);
}

// float {{{2
// double -> float: reads four doubles starting at mem and narrows them to four floats.
template <typename Flags>
Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>)
{
#ifdef Vc_IMPL_AVX
    // With AVX all four doubles fit into one 256-bit register; pick the load by flag.
    if (Flags::IsUnaligned) {
        return _mm256_cvtpd_ps(_mm256_loadu_pd(mem));
    }
    if (Flags::IsStreaming) {
        return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem));
    }
    // Flags::IsAligned
    return _mm256_cvtpd_ps(_mm256_load_pd(mem));
#else
    // Without AVX: convert two doubles at a time and merge the two low halves.
    const __m128 lowPair = _mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[0]));
    const __m128 highPair = _mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[2]));
    return _mm_movelh_ps(lowPair, highPair);
#endif
}
// uint -> float: plain uint load followed by SSE::convert<uint, float>.
template <typename Flags>
Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>)
{
    const __m128i asUint = load<__m128i, uint>(mem, f);
    return SSE::convert<uint, float>(asUint);
}
// Any other non-float source T -> float: first load/convert to int lanes via
// load<__m128i, int>, then convert those to float with _mm_cvtepi32_ps.
template <typename T, typename Flags,
          typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>)
{
    const __m128i asInt = load<__m128i, int>(mem, f);
    return _mm_cvtepi32_ps(asInt);
}

// shifted{{{1
// Byte-wise shift of a 16-byte register by the compile-time constant `amount`:
//   amount == 0 -> returned unchanged
//   amount  > 0 -> _mm_srli_si128 by amount bytes
//   amount  < 0 -> _mm_slli_si128 by -amount bytes
// Exactly one overload is enabled per instantiation via enable_if.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<amount == 0, T> shifted(T k)
{
    return k;
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k)
{
    const T towardsLow = _mm_srli_si128(k, amount);
    return towardsLow;
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k)
{
    const T towardsHigh = _mm_slli_si128(k, -amount);
    return towardsHigh;
}

// IndexesFromZero{{{1
// Returns the SSE::_IndexesFromZero<Size> constant table reinterpreted as const T *,
// for Size 4, 8 or 16; any other Size yields a null pointer.
template <typename T, int Size> Vc_INTRINSIC Vc_CONST const T *IndexesFromZero()
{
    switch (Size) {
    case 4:
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero4);
    case 8:
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero8);
    case 16:
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero16);
    default:
        return nullptr;
    }
}

// popcnt{{{1
// Number of set bits among the low 4 bits of n.
// Note: the fallback only looks at bits 0-3, whereas the POPCNT path counts all 32
// bits, so the two agree only when the upper bits of n are zero.
Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    // sum neighbouring bits into 2-bit fields, then add the two fields
    const unsigned int pairs = (n & 0x5U) + ((n >> 1) & 0x5U);
    return (pairs & 0x3U) + ((pairs >> 2) & 0x3U);
#endif
}
// Number of set bits among the low 8 bits of n.
// Note: the fallback only looks at bits 0-7, whereas the POPCNT path counts all 32
// bits, so the two agree only when the upper bits of n are zero.
Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    // pairwise tree reduction: 1-bit -> 2-bit -> 4-bit -> 8-bit partial sums
    const unsigned int pairs = (n & 0x55U) + ((n >> 1) & 0x55U);
    const unsigned int nibbles = (pairs & 0x33U) + ((pairs >> 2) & 0x33U);
    return (nibbles & 0x0fU) + ((nibbles >> 4) & 0x0fU);
#endif
}
// Number of set bits among the low 16 bits of n.
// Note: the fallback only looks at bits 0-15, whereas the POPCNT path counts all 32
// bits, so the two agree only when the upper bits of n are zero.
Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    // pairwise tree reduction: 1-bit -> 2-bit -> 4-bit -> 8-bit -> 16-bit partial sums
    const unsigned int pairs = (n & 0x5555U) + ((n >> 1) & 0x5555U);
    const unsigned int nibbles = (pairs & 0x3333U) + ((pairs >> 2) & 0x3333U);
    const unsigned int bytes = (nibbles & 0x0f0fU) + ((nibbles >> 4) & 0x0f0fU);
    return (bytes & 0x00ffU) + ((bytes >> 8) & 0x00ffU);
#endif
}
// Number of set bits in the 32-bit value n.
Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    // pairwise tree reduction: each step doubles the width of the partial sums
    const unsigned int pairs = (n & 0x55555555U) + ((n >> 1) & 0x55555555U);
    const unsigned int nibbles = (pairs & 0x33333333U) + ((pairs >> 2) & 0x33333333U);
    const unsigned int bytes = (nibbles & 0x0f0f0f0fU) + ((nibbles >> 4) & 0x0f0f0f0fU);
    const unsigned int words = (bytes & 0x00ff00ffU) + ((bytes >> 8) & 0x00ff00ffU);
    return (words & 0x0000ffffU) + ((words >> 16) & 0x0000ffffU);
#endif
}

// mask_cast{{{1
// Primary template: only valid when the entry count does not change (From == To) and
// the result type is __m128; then the mask bits are merely reinterpreted. Every real
// size change is handled by one of the explicit specializations below.
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k)
{
    static_assert(From == To, "Incorrect mask cast.");
    static_assert(std::is_same<R, __m128>::value, "Incorrect mask cast.");
    const __m128 sameBits = SSE::sse_cast<__m128>(k);
    return sameBits;
}

// 2 entries -> 4 entries: one saturating pack halves the width of every mask entry;
// the upper half of the result is filled from the zero operand.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k)
{
    const __m128i halved = _mm_packs_epi16(k, _mm_setzero_si128());
    return SSE::sse_cast<__m128>(halved);
}
// 2 entries -> 8 entries: two consecutive packs quarter the width of every entry.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k)
{
    const __m128i halved = _mm_packs_epi16(k, _mm_setzero_si128());
    const __m128i quartered = _mm_packs_epi16(halved, _mm_setzero_si128());
    return SSE::sse_cast<__m128>(quartered);
}

// 4 entries -> 2 entries: duplicate each of the two low 32-bit entries to 64 bits.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k)
{
    const __m128i doubled = _mm_unpacklo_epi32(k, k);
    return SSE::sse_cast<__m128>(doubled);
}
// 4 entries -> 8 entries: one saturating pack halves the width of every mask entry;
// the upper half of the result is filled from the zero operand.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k)
{
    const __m128i halved = _mm_packs_epi16(k, _mm_setzero_si128());
    return SSE::sse_cast<__m128>(halved);
}

// 8 entries -> 2 entries: widen the two lowest 16-bit entries to 64 bits by
// duplicating them twice (16 -> 32 -> 64 bits).
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k)
{
    const __m128i as32 = _mm_unpacklo_epi16(k, k);
    const __m128i as64 = _mm_unpacklo_epi32(as32, as32);
    return SSE::sse_cast<__m128>(as64);
}
// 8 entries -> 4 entries: duplicate each of the four low 16-bit entries to 32 bits.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k)
{
    const __m128i as32 = _mm_unpacklo_epi16(k, k);
    return SSE::sse_cast<__m128>(as32);
}

// 16 entries -> 8 entries: duplicate each of the eight low bytes to 16 bits.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k)
{
    const __m128i as16 = _mm_unpacklo_epi8(k, k);
    return SSE::sse_cast<__m128>(as16);
}
// 16 entries -> 4 entries: built on the 16 -> 8 cast, then widened once more.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k)
{
    const auto as16 = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k));
    const __m128i as32 = _mm_unpacklo_epi16(as16, as16);
    return SSE::sse_cast<__m128>(as32);
}
// 16 entries -> 2 entries: built on the 16 -> 4 cast, then widened once more.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k)
{
    const auto as32 = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k));
    const __m128i as64 = _mm_unpacklo_epi32(as32, as32);
    return SSE::sse_cast<__m128>(as64);
}

// allone{{{1
// allone<V>() forwards to the SSE::_mm_setallone_* helper matching the register type
// V; only the three specializations below are defined.
template <typename V> Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R;
template<> Vc_INTRINSIC Vc_CONST __m128 allone<__m128>()
{
    return SSE::_mm_setallone_ps();
}
template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>()
{
    return SSE::_mm_setallone_si128();
}
template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>()
{
    return SSE::_mm_setallone_pd();
}

// zero{{{1
// zero<V>() yields an all-zero register of type V via the matching _mm_setzero_*
// intrinsic; only the three specializations below are defined.
template <typename V> inline V zero();
template<> Vc_INTRINSIC Vc_CONST __m128 zero<__m128>()
{
    return _mm_setzero_ps();
}
template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>()
{
    return _mm_setzero_si128();
}
template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>()
{
    return _mm_setzero_pd();
}

// negate{{{1
// Floating-point negation: XOR with the sign-bit mask flips the sign of every entry.
// The integral_constant tag carries the entry size in bytes and selects the overload.
Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant<std::size_t, 4>)
{
    const __m128 signBits = SSE::_mm_setsignmask_ps();
    return _mm_xor_ps(v, signBits);
}
Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant<std::size_t, 8>)
{
    const __m128d signBits = SSE::_mm_setsignmask_pd();
    return _mm_xor_pd(v, signBits);
}
// Integer negation for 4-byte and 2-byte entries (selected by the integral_constant
// tag). With SSSE3 the sign intrinsic is applied with an all-ones (i.e. negative)
// second operand; otherwise v is subtracted from zero.
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 4>)
{
#ifdef Vc_IMPL_SSSE3
    const __m128i allNegative = allone<__m128i>();
    return _mm_sign_epi32(v, allNegative);
#else
    const __m128i nothing = _mm_setzero_si128();
    return _mm_sub_epi32(nothing, v);
#endif
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 2>)
{
#ifdef Vc_IMPL_SSSE3
    const __m128i allNegative = allone<__m128i>();
    return _mm_sign_epi16(v, allNegative);
#else
    const __m128i nothing = _mm_setzero_si128();
    return _mm_sub_epi16(nothing, v);
#endif
}

// xor_{{{1
// Bitwise XOR of two registers, overloaded on the register type.
Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b)
{
    return _mm_xor_ps(a, b);
}
Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b)
{
    return _mm_xor_pd(a, b);
}
Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

// or_{{{1
// Bitwise OR of two registers, overloaded on the register type.
Vc_INTRINSIC __m128 or_(__m128 a, __m128 b)
{
    return _mm_or_ps(a, b);
}
Vc_INTRINSIC __m128d or_(__m128d a, __m128d b)
{
    return _mm_or_pd(a, b);
}
Vc_INTRINSIC __m128i or_(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

// and_{{{1
// Bitwise AND of two registers, overloaded on the register type.
Vc_INTRINSIC __m128 and_(__m128 a, __m128 b)
{
    return _mm_and_ps(a, b);
}
Vc_INTRINSIC __m128d and_(__m128d a, __m128d b)
{
    return _mm_and_pd(a, b);
}
Vc_INTRINSIC __m128i and_(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

// andnot_{{{1
// Forwards to the andnot intrinsic for the register type (per the Intel definition it
// computes (~a) & b, i.e. the FIRST operand is the one that gets inverted).
Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b)
{
    return _mm_andnot_ps(a, b);
}
Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b)
{
    return _mm_andnot_pd(a, b);
}
Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b)
{
    return _mm_andnot_si128(a, b);
}

// not_{{{1
// Bitwise complement, implemented as andnot_(a, all-ones).
Vc_INTRINSIC __m128 not_(__m128 a)
{
    const __m128 ones = allone<__m128>();
    return andnot_(a, ones);
}
Vc_INTRINSIC __m128d not_(__m128d a)
{
    const __m128d ones = allone<__m128d>();
    return andnot_(a, ones);
}
Vc_INTRINSIC __m128i not_(__m128i a)
{
    const __m128i ones = allone<__m128i>();
    return andnot_(a, ones);
}

// add{{{1
// Element-wise a + b. The unnamed third parameter is a pure type tag that selects the
// entry type (and thereby the lane width of the intrinsic).
// floating point
Vc_INTRINSIC __m128 add(__m128 a, __m128 b, float) { return _mm_add_ps(a, b); }
Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); }
// 32-bit lanes (same intrinsic for signed and unsigned)
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, int) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uint) { return _mm_add_epi32(a, b); }
// 16-bit lanes
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, short) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); }
// 8-bit lanes
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, schar) { return _mm_add_epi8(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uchar) { return _mm_add_epi8(a, b); }

// sub{{{1
// Element-wise a - b. The unnamed third parameter is a pure type tag that selects the
// entry type (and thereby the lane width of the intrinsic).
// floating point
Vc_INTRINSIC __m128 sub(__m128 a, __m128 b, float) { return _mm_sub_ps(a, b); }
Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); }
// 32-bit lanes (same intrinsic for signed and unsigned)
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, int) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uint) { return _mm_sub_epi32(a, b); }
// 16-bit lanes
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, short) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); }
// 8-bit lanes
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, schar) { return _mm_sub_epi8(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uchar) { return _mm_sub_epi8(a, b); }

// mul{{{1
// Element-wise a * b for floating-point entries; the third parameter is a type tag.
Vc_INTRINSIC __m128 mul(__m128 a, __m128 b, float)
{
    return _mm_mul_ps(a, b);
}
Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double)
{
    return _mm_mul_pd(a, b);
}
// Element-wise 32-bit multiplication keeping the low 32 bits of every product.
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, int)
{
#ifdef Vc_IMPL_SSE4_1
    return _mm_mullo_epi32(a, b);
#else
    // Without SSE4.1 only _mm_mul_epu32 is available, which multiplies lanes 0 and 2
    // into two 64-bit products. Handle the even lanes directly, the odd lanes after
    // shifting them down by 4 bytes, then interleave the low 32 bits of all products.
    const __m128i evenProducts = _mm_mul_epu32(a, b);  // [a0 * b0, a2 * b2]
    const __m128i oddProducts =
        _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));  // [a1 * b1, a3 * b3]
    // _MM_SHUFFLE(0, 0, 2, 0) == 8: moves 32-bit lanes 0 and 2 into lanes 0 and 1
    const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0));
    return _mm_unpacklo_epi32(evenLow, oddLow);
#endif
}
// uint reuses the int overload; both 16-bit entry types map to _mm_mullo_epi16.
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uint)
{
    return mul(a, b, int());
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, short)
{
    return _mm_mullo_epi16(a, b);
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort)
{
    return _mm_mullo_epi16(a, b);
}
// Element-wise multiplication of sixteen signed 8-bit entries, keeping the low 8 bits
// of every product.
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,  schar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    // let the compiler's builtin vector type do the byte-wise multiplication
    using B = Common::BuiltinType<schar, 16>;
    const auto x = reinterpret_cast<const MayAlias<B> &>(a) *
                   reinterpret_cast<const MayAlias<B> &>(b);
    return reinterpret_cast<const __m128i &>(x);
#else
    // There is no 8-bit multiply, so two 16-bit multiplies are combined:
    //  - even bytes: the LOW byte of each 16-bit product a * b is a_even * b_even
    //    (mod 256); it is kept with the mask 0x00ff. (The previous mask, 0xff00, kept
    //    the high byte instead, which mixes in cross terms and is not a valid result.)
    //  - odd bytes: shift both inputs down by one byte so the odd bytes sit in the low
    //    byte of each 16-bit lane, multiply, and move the low byte of that product
    //    back up into the high byte.
    const __m128i lowByteMask = _mm_srli_epi16(allone<__m128i>(), 8);
    const __m128i evenBytes = and_(_mm_mullo_epi16(a, b), lowByteMask);
    const __m128i oddBytes =
        _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8);
    return or_(evenBytes, oddBytes);
#endif
}
// Element-wise multiplication of sixteen unsigned 8-bit entries, keeping the low 8
// bits of every product.
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,  uchar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    // let the compiler's builtin vector type do the byte-wise multiplication
    using B = Common::BuiltinType<uchar, 16>;
    const auto x = reinterpret_cast<const MayAlias<B> &>(a) *
                   reinterpret_cast<const MayAlias<B> &>(b);
    return reinterpret_cast<const __m128i &>(x);
#else
    // There is no 8-bit multiply, so two 16-bit multiplies are combined:
    //  - even bytes: the LOW byte of each 16-bit product a * b is a_even * b_even
    //    (mod 256); it is kept with the mask 0x00ff. (The previous mask, 0xff00, kept
    //    the high byte instead, which mixes in cross terms and is not a valid result.)
    //  - odd bytes: shift both inputs down by one byte so the odd bytes sit in the low
    //    byte of each 16-bit lane, multiply, and move the low byte of that product
    //    back up into the high byte.
    const __m128i lowByteMask = _mm_srli_epi16(allone<__m128i>(), 8);
    const __m128i evenBytes = and_(_mm_mullo_epi16(a, b), lowByteMask);
    const __m128i oddBytes =
        _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8);
    return or_(evenBytes, oddBytes);
#endif
}

// div{{{1
// Element-wise a / b; only the floating-point entry types are provided here.
Vc_INTRINSIC __m128 div(__m128 a, __m128 b, float)
{
    return _mm_div_ps(a, b);
}
Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double)
{
    return _mm_div_pd(a, b);
}

// TODO: fma{{{1
//Vc_INTRINSIC __m128  fma(__m128  a, __m128  b, __m128  c,  float) { return _mm_mul_ps(a, b); }
//Vc_INTRINSIC __m128d fma(__m128d a, __m128d b, __m128d c, double) { return _mm_mul_pd(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,    int) { }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,   uint) { return fma(a, b, int()); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,  short) { return _mm_mullo_epi16(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, ushort) { return _mm_mullo_epi16(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,  schar) { }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,  uchar) { }

// min{{{1
// Element-wise minimum of a and b. The unnamed third parameter is a type tag selecting
// the entry type; where this file calls an SSE:: helper instead of a raw intrinsic, the
// helper is defined elsewhere.
// floating point
Vc_INTRINSIC __m128 min(__m128 a, __m128 b, float) { return _mm_min_ps(a, b); }
Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); }
// 32-bit lanes
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, int) { return SSE::min_epi32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uint) { return SSE::min_epu32(a, b); }
// 16-bit lanes
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, short) { return _mm_min_epi16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); }
// 8-bit lanes
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, schar) { return SSE::min_epi8(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uchar) { return _mm_min_epu8(a, b); }

// max{{{1
// Element-wise maximum of a and b. The unnamed third parameter is a type tag selecting
// the entry type; where this file calls an SSE:: helper instead of a raw intrinsic, the
// helper is defined elsewhere.
// floating point
Vc_INTRINSIC __m128 max(__m128 a, __m128 b, float) { return _mm_max_ps(a, b); }
Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); }
// 32-bit lanes
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, int) { return SSE::max_epi32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uint) { return SSE::max_epu32(a, b); }
// 16-bit lanes
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, short) { return _mm_max_epi16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); }
// 8-bit lanes
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, schar) { return SSE::max_epi8(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uchar) { return _mm_max_epu8(a, b); }

// horizontal add{{{1
// Horizontal sum: folds all lanes of `a` into lane 0 and returns that scalar.
// The unnamed second parameter is a tag selecting the lane type.
Vc_INTRINSIC  float add(__m128  a,  float) {
    // lanes 0/1 become a0+a2 and a1+a3; lane 1 is then folded onto lane 0
    const __m128 pairs = _mm_add_ps(a, _mm_movehl_ps(a, a));
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(_mm_add_ss(pairs, lane1));
}
Vc_INTRINSIC double add(__m128d a, double) {
    const __m128d upper = _mm_unpackhi_pd(a, a);
    return _mm_cvtsd_f64(_mm_add_sd(a, upper));
}
Vc_INTRINSIC    int add(__m128i a,    int) {
    // halve the number of live lanes with each byte shift: 8 bytes, then 4
    const __m128i by8 = add(a, _mm_srli_si128(a, 8), int());
    const __m128i by4 = add(by8, _mm_srli_si128(by8, 4), int());
    return _mm_cvtsi128_si32(by4);
}
Vc_INTRINSIC   uint add(__m128i a,   uint) {
    // reuse the int reduction; the result is converted to uint on return
    const int total = add(a, int());
    return total;
}
Vc_INTRINSIC  short add(__m128i a,  short) {
    const __m128i by8 = add(a, _mm_srli_si128(a, 8), short());
    const __m128i by4 = add(by8, _mm_srli_si128(by8, 4), short());
    const __m128i by2 = add(by4, _mm_srli_si128(by4, 2), short());
    return _mm_cvtsi128_si32(by2);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort add(__m128i a, ushort) {
    // reuse the short reduction; the result is converted to ushort on return
    const short total = add(a, short());
    return total;
}
Vc_INTRINSIC  schar add(__m128i a,  schar) {
    const __m128i by8 = add(a, _mm_srli_si128(a, 8), schar());
    const __m128i by4 = add(by8, _mm_srli_si128(by8, 4), schar());
    const __m128i by2 = add(by4, _mm_srli_si128(by4, 2), schar());
    const __m128i by1 = add(by2, _mm_srli_si128(by2, 1), schar());
    return _mm_cvtsi128_si32(by1);  // & 0xff is implicit
}
Vc_INTRINSIC  uchar add(__m128i a,  uchar) {
    // reuse the schar reduction; the result is converted to uchar on return
    const schar total = add(a, schar());
    return total;
}

// horizontal mul{{{1
// Horizontal product: multiplies all lanes of `a` together and returns the
// scalar result. The unnamed second parameter is a tag selecting the lane type.
Vc_INTRINSIC  float mul(__m128  a,  float) {
    // lanes 0/1 become a0*a2 and a1*a3; lane 1 is then folded onto lane 0
    const __m128 pairs = _mm_mul_ps(a, _mm_movehl_ps(a, a));
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(_mm_mul_ss(pairs, lane1));
}
Vc_INTRINSIC double mul(__m128d a, double) {
    const __m128d upper = _mm_unpackhi_pd(a, a);
    return _mm_cvtsd_f64(_mm_mul_sd(a, upper));
}
Vc_INTRINSIC    int mul(__m128i a,    int) {
    // halve the number of live lanes with each byte shift: 8 bytes, then 4
    const __m128i by8 = mul(a, _mm_srli_si128(a, 8), int());
    const __m128i by4 = mul(by8, _mm_srli_si128(by8, 4), int());
    return _mm_cvtsi128_si32(by4);
}
Vc_INTRINSIC   uint mul(__m128i a,   uint) {
    // reuse the int reduction; the result is converted to uint on return
    const int product = mul(a, int());
    return product;
}
Vc_INTRINSIC  short mul(__m128i a,  short) {
    const __m128i by8 = mul(a, _mm_srli_si128(a, 8), short());
    const __m128i by4 = mul(by8, _mm_srli_si128(by8, 4), short());
    const __m128i by2 = mul(by4, _mm_srli_si128(by4, 2), short());
    return _mm_cvtsi128_si32(by2);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort mul(__m128i a, ushort) {
    // reuse the short reduction; the result is converted to ushort on return
    const short product = mul(a, short());
    return product;
}
// Horizontal product of the 16 bytes in `a`.
// Only the low 8 bits of the result are meaningful, and those depend solely on
// the low 8 bits of every factor, so the reduction can be done on 16-bit lanes.
Vc_INTRINSIC  schar mul(__m128i a,  schar) {
    // convert to two short vectors, multiply them and then do horizontal reduction
    // s0: odd-indexed bytes, moved down into the low byte of each 16-bit lane
    const __m128i s0 = _mm_srai_epi16(a, 8);
    // s1: even-indexed bytes, the high byte of each 16-bit lane is cleared
    const __m128i s1 = Detail::and_(a, _mm_set1_epi32(0x00ff00ff));
    return mul(mul(s0, s1, short()), short());
}
// The low 8 bits of a product do not depend on signedness, so the uchar variant
// reuses the schar reduction and converts the result on return.
Vc_INTRINSIC  uchar mul(__m128i a,  uchar) { return mul(a, schar()); }

// horizontal min{{{1
// Horizontal minimum: reduces all lanes of `a` to the smallest value.
// The unnamed second parameter is a tag selecting the lane type.
Vc_INTRINSIC  float min(__m128  a,  float) {
    // lanes 0/1 become min(a0,a2) and min(a1,a3); lane 1 is then folded onto lane 0
    const __m128 pairs = _mm_min_ps(a, _mm_movehl_ps(a, a));
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(_mm_min_ss(pairs, lane1));
}
Vc_INTRINSIC double min(__m128d a, double) {
    const __m128d upper = _mm_unpackhi_pd(a, a);
    return _mm_cvtsd_f64(_mm_min_sd(a, upper));
}
Vc_INTRINSIC    int min(__m128i a,    int) {
    // step 1 swaps the 64-bit halves, step 2 swaps the two low 32-bit lanes
    const __m128i halves = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    const __m128i dwords = min(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), int());
    return _mm_cvtsi128_si32(dwords);
}
Vc_INTRINSIC   uint min(__m128i a,   uint) {
    const __m128i halves = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    const __m128i dwords = min(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    return _mm_cvtsi128_si32(dwords);
}
Vc_INTRINSIC  short min(__m128i a,  short) {
    // the third step broadcasts 16-bit lane 1 so it meets lane 0
    const __m128i halves = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    const __m128i dwords = min(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), short());
    const __m128i words = min(dwords, _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(1, 1, 1, 1)), short());
    return _mm_cvtsi128_si32(words);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort min(__m128i a, ushort) {
    const __m128i halves = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    const __m128i dwords = min(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    const __m128i words = min(dwords, _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
    return _mm_cvtsi128_si32(words);  // & 0xffff is implicit
}
Vc_INTRINSIC  schar min(__m128i a,  schar) {
    const __m128i halves = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    const __m128i dwords = min(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    const __m128i words = min(dwords, _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    // the low 16 bits now hold two candidates (bytes 0 and 1); compare them as scalars
    const int low = _mm_cvtsi128_si32(words);
    return std::min(schar(low >> 8), schar(low));
}
// Horizontal minimum of the 16 unsigned bytes in `a`.
// The vector steps must use the uchar tag so that bytes are compared as
// unsigned values; a signed comparison would rank every value >= 128 below
// the values 0..127 and return the wrong minimum.
Vc_INTRINSIC  uchar min(__m128i a,  uchar) {
    // step 1 swaps the 64-bit halves, step 2 the low 32-bit lanes, step 3
    // broadcasts 16-bit lane 1 so it meets lane 0
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), uchar());
    // bytes 0 and 1 hold the two remaining candidates; compare them as unsigned scalars
    return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}

// horizontal max{{{1
// Horizontal maximum: reduces all lanes of `a` to the largest value.
// The unnamed second parameter is a tag selecting the lane type.
Vc_INTRINSIC  float max(__m128  a,  float) {
    // lanes 0/1 become max(a0,a2) and max(a1,a3); lane 1 is then folded onto lane 0
    const __m128 pairs = _mm_max_ps(a, _mm_movehl_ps(a, a));
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(_mm_max_ss(pairs, lane1));
}
Vc_INTRINSIC double max(__m128d a, double) {
    const __m128d upper = _mm_unpackhi_pd(a, a);
    return _mm_cvtsd_f64(_mm_max_sd(a, upper));
}
Vc_INTRINSIC    int max(__m128i a,    int) {
    // step 1 swaps the 64-bit halves, step 2 swaps the two low 32-bit lanes
    const __m128i halves = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    const __m128i dwords = max(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), int());
    return _mm_cvtsi128_si32(dwords);
}
Vc_INTRINSIC   uint max(__m128i a,   uint) {
    const __m128i halves = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    const __m128i dwords = max(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    return _mm_cvtsi128_si32(dwords);
}
Vc_INTRINSIC  short max(__m128i a,  short) {
    // the third step broadcasts 16-bit lane 1 so it meets lane 0
    const __m128i halves = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    const __m128i dwords = max(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), short());
    const __m128i words = max(dwords, _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(1, 1, 1, 1)), short());
    return _mm_cvtsi128_si32(words);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort max(__m128i a, ushort) {
    const __m128i halves = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    const __m128i dwords = max(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    const __m128i words = max(dwords, _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
    return _mm_cvtsi128_si32(words);  // & 0xffff is implicit
}
Vc_INTRINSIC  schar max(__m128i a,  schar) {
    const __m128i halves = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    const __m128i dwords = max(halves, _mm_shufflelo_epi16(halves, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    const __m128i words = max(dwords, _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    // the low 16 bits now hold two candidates (bytes 0 and 1); compare them as scalars
    const int low = _mm_cvtsi128_si32(words);
    return std::max(schar(low >> 8), schar(low));
}
// Horizontal maximum of the 16 unsigned bytes in `a`.
// The vector steps must use the uchar tag so that bytes are compared as
// unsigned values; a signed comparison would rank every value >= 128 below
// the values 0..127 and return the wrong maximum.
Vc_INTRINSIC  uchar max(__m128i a,  uchar) {
    // step 1 swaps the 64-bit halves, step 2 the low 32-bit lanes, step 3
    // broadcasts 16-bit lane 1 so it meets lane 0
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), uchar());
    // bytes 0 and 1 hold the two remaining candidates; compare them as unsigned scalars
    return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}

// sorted{{{1
// Implementation-specific sort; only declared here, selected via the first
// template argument.
template <Vc::Implementation, typename T>
Vc_CONST_L SSE::Vector<T> sorted(SSE::Vector<T> x) Vc_CONST_R;

// Dispatches to the sorted<Impl> overload matching the implementation the
// translation unit is compiled for.
template <typename T> Vc_INTRINSIC Vc_CONST SSE::Vector<T> sorted(SSE::Vector<T> x)
{
    static_assert(!CurrentImplementation::is(ScalarImpl),
                  "Detail::sorted can only be instantiated if a non-Scalar "
                  "implementation is selected.");
    // SSE2..SSSE3 map to the SSE2 variant, SSE4.1..SSE4.2 to the SSE4.1 variant;
    // anything else is forwarded as the current implementation itself.
    constexpr auto selected =
        CurrentImplementation::is_between(SSE2Impl, SSSE3Impl)
            ? SSE2Impl
            : CurrentImplementation::is_between(SSE41Impl, SSE42Impl)
                  ? SSE41Impl
                  : CurrentImplementation::current();
    return sorted<selected>(x);
}

// sanitize{{{1
// Returns n unchanged while it lies strictly between -sizeof(V) and
// +sizeof(V); any other value is replaced by 0.
template <typename V> constexpr int sanitize(int n)
{
    return (n < int(sizeof(V)) && n > -int(sizeof(V))) ? n : 0;
}

// rotated{{{1
// Rotates the 16-byte vector v by (amount % N) entries of type T.
//
// T      entry type; its size determines the byte count of one rotation step
// N      modulus applied to amount (presumably the number of entries in V -
//        TODO confirm against callers)
// V      vector/register type; the overload only participates if sizeof(V) == 16
// amount rotation count; it is converted to unsigned before the modulo, so
//        negative values wrap around instead of producing a negative remainder
//
// The byte count handed to _mm_alignr_epi8 has to be a compile-time constant,
// hence one switch case per possible step count. sanitize<V> replaces byte
// counts >= sizeof(V) with 0 so that every case compiles with a valid operand.
// NOTE(review): only remainders 0..7 are handled; a remainder above 7 (possible
// only if N > 8) falls through and yields an all-zero vector.
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount)
{
    using namespace SSE;
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        // nothing to rotate
        return v;
    case 1:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(1 * sizeof(T))));
    case 2:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(2 * sizeof(T))));
    case 3:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(3 * sizeof(T))));
    case 4:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(4 * sizeof(T))));
    case 5:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(5 * sizeof(T))));
    case 6:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(6 * sizeof(T))));
    case 7:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(7 * sizeof(T))));
    }
    // reached only when the remainder is not one of the cases above
    return sse_cast<V>(_mm_setzero_si128());
}

//InterleaveImpl{{{1
// Primary template; only the specializations are defined.
template<typename V, int Size, size_t VSize> struct InterleaveImpl;
// Specialization for Size == 8 and VSize == 16. All shuffles below operate on
// 16-bit lanes (epi16 unpacks), i.e. 8 entries per vector.
//
// interleave(data, i, v0, ..., vM-1) writes, for every k in 0..7, the entries
// v0[k], ..., vM-1[k] to data[i[k]], data[i[k] + 1], ...
// deinterleave(data, i, v0, ..., vM-1) is the inverse: entry k of vj is read
// from data[i[k] + j].
//
// Notation used in the lane comments of the deinterleave functions: the letter
// names the member (a = first entry at data[i[k]], b = second, ...) and the
// digit is the index k. Note that this differs from the local variables a..h,
// which hold the data loaded for index 0..7.
template<typename V> struct InterleaveImpl<V, 8, 16> {
    // 2 members, arbitrary indexes: each (v0[k], v1[k]) pair is written as one
    // 32-bit store at &data[i[k]].
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        // tmp0 holds the pairs for k = 0..3, tmp1 those for k = 4..7
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
#ifdef __x86_64__
        // extract two pairs at a time via 64-bit moves
        const long long tmp00 = _mm_cvtsi128_si64(tmp0);
        const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
        const long long tmp10 = _mm_cvtsi128_si64(tmp1);
        const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
        *reinterpret_cast<MayAlias<int> *>(&data[i[0]]) = tmp00;
        *reinterpret_cast<MayAlias<int> *>(&data[i[1]]) = tmp00 >> 32;
        *reinterpret_cast<MayAlias<int> *>(&data[i[2]]) = tmp01;
        *reinterpret_cast<MayAlias<int> *>(&data[i[3]]) = tmp01 >> 32;
        *reinterpret_cast<MayAlias<int> *>(&data[i[4]]) = tmp10;
        *reinterpret_cast<MayAlias<int> *>(&data[i[5]]) = tmp10 >> 32;
        *reinterpret_cast<MayAlias<int> *>(&data[i[6]]) = tmp11;
        *reinterpret_cast<MayAlias<int> *>(&data[i[7]]) = tmp11 >> 32;
#elif defined(Vc_IMPL_SSE4_1)
        // extract each 32-bit pair directly
        using namespace SseIntrinsics;
        *reinterpret_cast<MayAlias<int> *>(&data[i[0]]) = _mm_cvtsi128_si32(tmp0);
        *reinterpret_cast<MayAlias<int> *>(&data[i[1]]) = extract_epi32<1>(tmp0);
        *reinterpret_cast<MayAlias<int> *>(&data[i[2]]) = extract_epi32<2>(tmp0);
        *reinterpret_cast<MayAlias<int> *>(&data[i[3]]) = extract_epi32<3>(tmp0);
        *reinterpret_cast<MayAlias<int> *>(&data[i[4]]) = _mm_cvtsi128_si32(tmp1);
        *reinterpret_cast<MayAlias<int> *>(&data[i[5]]) = extract_epi32<1>(tmp1);
        *reinterpret_cast<MayAlias<int> *>(&data[i[6]]) = extract_epi32<2>(tmp1);
        *reinterpret_cast<MayAlias<int> *>(&data[i[7]]) = extract_epi32<3>(tmp1);
#else
        // fallback: shift the wanted pair down to the lowest 32 bits first
        *reinterpret_cast<MayAlias<int> *>(&data[i[0]]) = _mm_cvtsi128_si32(tmp0);
        *reinterpret_cast<MayAlias<int> *>(&data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
        *reinterpret_cast<MayAlias<int> *>(&data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
        *reinterpret_cast<MayAlias<int> *>(&data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
        *reinterpret_cast<MayAlias<int> *>(&data[i[4]]) = _mm_cvtsi128_si32(tmp1);
        *reinterpret_cast<MayAlias<int> *>(&data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
        *reinterpret_cast<MayAlias<int> *>(&data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
        *reinterpret_cast<MayAlias<int> *>(&data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
#endif
    }/*}}}*/
    // 2 members, SuccessiveEntries<2> indexes: the pairs are written with two
    // unaligned full-vector stores, at data[i[0]] (k = 0..3) and data[i[4]]
    // (k = 4..7).
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
        V(tmp0).store(&data[i[0]], Vc::Unaligned);
        V(tmp1).store(&data[i[4]], Vc::Unaligned);
    }/*}}}*/
    // 3 members, arbitrary indexes.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
    {
#if defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC
        // MSVC fails to compile the MMX intrinsics
        // byte mask enabling the three low 16-bit lanes; the fourth lane is not written
        const __m64 mask = _mm_set_pi16(0, -1, -1, -1);
        // v1 is unpacked with itself, so each 64-bit group becomes
        // (v0[k], v1[k], v2[k], v1[k]); the last lane is masked out on store
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        _mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast<char *>(&data[i[0]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast<char *>(&data[i[1]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast<char *>(&data[i[2]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast<char *>(&data[i[3]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast<char *>(&data[i[4]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast<char *>(&data[i[5]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast<char *>(&data[i[6]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast<char *>(&data[i[7]]));
        // leave MMX state after using __m64 values
        _mm_empty();
#else
        // write the first two members as pairs, then scatter the third behind them
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
#endif
    }/*}}}*/
    // 4 members, arbitrary indexes: each (v0[k], v1[k], v2[k], v3[k]) group is
    // written with one 64-bit store at &data[i[k]].
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());

        // tmp4..tmp7 each hold the groups of two consecutive k: (0,1), (2,3), (4,5), (6,7)
        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        // low 64 bits go to the even k, high 64 bits to the odd k
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
    }/*}}}*/
    // 4 members, SuccessiveEntries<4> indexes: two groups per unaligned
    // full-vector store, at data[i[0]], data[i[2]], data[i[4]] and data[i[6]].
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        V(tmp4).store(&data[i[0]], ::Vc::Unaligned);
        V(tmp5).store(&data[i[2]], ::Vc::Unaligned);
        V(tmp6).store(&data[i[4]], ::Vc::Unaligned);
        V(tmp7).store(&data[i[6]], ::Vc::Unaligned);
    }/*}}}*/
    // 5 to 8 members are composed from the 4-member interleave for v0..v3 plus
    // a scatter or a smaller interleave for the rest, offset by 4 entries.
    template <typename I>  // interleave 5 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    // 2 members: loads 32 bits (two 16-bit lanes) per index and transposes
    // them into v0 and v1.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data, /*{{{*/
            const I &i, V &v0, V &v1)
    {
        // only members a and b are loaded; the remaining lanes of each register are zero
        const __m128i a = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[0]]));
        const __m128i b = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[1]]));
        const __m128i c = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[2]]));
        const __m128i d = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[3]]));
        const __m128i e = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[4]]));
        const __m128i f = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[5]]));
        const __m128i g = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[6]]));
        const __m128i h = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
    }/*}}}*/
    // 3 members.
    // NOTE(review): each index is loaded with a 64-bit load, i.e. four 16-bit
    // lanes, although only three members are used - assuming a 2-byte
    // EntryType this reads one entry past the third member; confirm callers
    // guarantee that memory is readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
    }/*}}}*/
    // 4 members: one 64-bit load (four 16-bit lanes) per index, transposed
    // into v0..v3.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
    }/*}}}*/
    // 5 members.
    // NOTE(review): the 5-, 6- and 7-member variants load a full 16 bytes
    // (eight 16-bit lanes) per index although fewer members are used; confirm
    // callers guarantee the extra bytes are readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
    }/*}}}*/
    // 6 members: same transpose as the 5-member variant, additionally
    // producing v5 from the upper half of tmp8/tmp9.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
    }/*}}}*/
    // 7 members: full 8x8 transpose of which only the first seven result
    // vectors are stored.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
        v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
    }/*}}}*/
    // 8 members: full 8x8 transpose of 16-bit lanes; every loaded lane ends up
    // in one of v0..v7.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
        v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
        v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
    }/*}}}*/
};
/**
 * Interleaved stores and deinterleaving loads implemented with the 128-bit
 * single-precision shuffle intrinsics (`_mm_unpack{lo,hi}_ps`, `_mm_move{lh,hl}_ps`).
 *
 * NOTE(review): the template arguments <V, 4, 16> presumably select vector types with
 * four entries in a 16-byte register - confirm against the primary template.
 *
 * In all members `data` is the base pointer and `i[k]` is the entry offset of the k-th
 * group. The n-th vector argument (v0, v1, ...) corresponds to the n-th consecutive
 * entry of every group, i.e. entry k of vn lives at data[i[k] + n].
 */
template<typename V> struct InterleaveImpl<V, 4, 16> {
    // Overload for Common::SuccessiveEntries<2> indexes: writes two full 16-byte vectors
    // at data[i[0]] and data[i[2]] instead of four 8-byte pairs.
    // NOTE(review): this relies on groups 0/1 and 2/3 being adjacent in memory, which
    // presumably is what SuccessiveEntries<2> guarantees - confirm.
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[0]]), tmp0);
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[2]]), tmp1);
    }/*}}}*/
    // Generic index type: every group receives one 8-byte pair {v0[k], v1[k]}.
    template <typename I>  // interleave 2 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        // tmp0 = {v0[0], v1[0], v0[1], v1[1]}, tmp1 = {v0[2], v1[2], v0[3], v1[3]}
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
    }
    // Three entries per group. With Vc_USE_MASKMOV_SCATTER a byte-masked 16-byte store
    // writes only the lowest three 32-bit lanes; otherwise the (v0, v1) pairs are stored
    // and v2 is scattered separately to data + 2.
    template <typename I>  // interleave 3 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
#ifdef Vc_USE_MASKMOV_SCATTER
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        // v2 is unpacked with itself; only the first lane of each pair ends up stored
        const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
        const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
        // lanes 0..2 enabled, lane 3 (highest) disabled
        const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
#else
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
        v2.scatter(data + 2, i);
#endif
    }
    // Four entries per group: a full 4x4 transpose followed by one unaligned 16-byte
    // store per group.
    template <typename I>  // interleave 4 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
        const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
    }
    // Five to eight entries per group are composed from the 4-argument store for the
    // first four entries plus a 1- to 4-entry store at data + 4.
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    // Two entries per group: an 8-byte load per group (via _mm_load_sd), then transpose.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        const __m128 a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const MayAlias<double> *>(&data[i[0]])));
        const __m128 b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const MayAlias<double> *>(&data[i[1]])));
        const __m128 c = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const MayAlias<double> *>(&data[i[2]])));
        const __m128 d = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const MayAlias<double> *>(&data[i[3]])));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
    }/*}}}*/
    // Three entries per group.
    // NOTE(review): every _mm_loadu_ps reads four entries although only three are used,
    // so one entry past each 3-entry group is read (and discarded) - callers must make
    // sure that memory is readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        const __m128 a = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0]]));
        const __m128 b = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[1]]));
        const __m128 c = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[2]]));
        const __m128 d = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[3]]));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
        const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX]
        const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
        v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
    }/*}}}*/
    // Four entries per group: four unaligned 16-byte loads and a full 4x4 transpose.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        const __m128 a = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0]]));
        const __m128 b = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[1]]));
        const __m128 c = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[2]]));
        const __m128 d = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[3]]));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
        const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
        const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
        v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
        v3.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp3, tmp2));
    }/*}}}*/
    // Five to eight entries per group: the 4-entry load for the first four entries plus
    // a 1- to 4-entry load starting at data + 4.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        v4.gather(data + 4, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6, v7);
    }/*}}}*/
};
/**
 * Interleaved stores and deinterleaving loads implemented with the 128-bit
 * double-precision intrinsics (two entries per register).
 *
 * `data` is the base pointer and `i[k]` the entry offset of the k-th group; the n-th
 * vector argument corresponds to the n-th consecutive entry of every group. All wider
 * overloads are compositions of the 2-argument transpose and single-vector
 * scatter/gather calls.
 */
template<typename V> struct InterleaveImpl<V, 2, 16> {
    template <typename I>  // interleave 2 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        // 2x2 transpose: group0 = {v0[0], v1[0]}, group1 = {v0[1], v1[1]}
        const __m128d group0 = _mm_unpacklo_pd(v0.data(), v1.data());
        const __m128d group1 = _mm_unpackhi_pd(v0.data(), v1.data());
        _mm_storeu_pd(&data[i[0]], group0);
        _mm_storeu_pd(&data[i[1]], group1);
    }
    template <typename I>  // interleave 3 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
        // first the pair, then the odd vector on its own
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
    }
    template <typename I>  // interleave 4 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        interleave(data, i, v0, v1);
        interleave(data + 2, i, v2, v3);
    }
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1);
        interleave(data + 2, i, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1);
        interleave(data + 2, i, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        // three pairs, then the trailing vector at entry offset 6
        interleave(data, i, v0, v1, v2, v3, v4, v5);
        v6.scatter(data + 6, i);
    }
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3, v4, v5);
        interleave(data + 6, i, v6, v7);
    }
    //}}}2
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        // one unaligned 16-byte load per group, then a 2x2 transpose
        const __m128d group0 = _mm_loadu_pd(&data[i[0]]);
        const __m128d group1 = _mm_loadu_pd(&data[i[1]]);

        v0.data() = _mm_unpacklo_pd(group0, group1);
        v1.data() = _mm_unpackhi_pd(group0, group1);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        // the odd vector is gathered before the pair is loaded
        v2.gather(data + 2, i);
        deinterleave(data, i, v0, v1);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        v4.gather(data + 4, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        // three pairs, then the trailing vector at entry offset 6
        deinterleave(data, i, v0, v1, v2, v3, v4, v5);
        v6.gather(data + 6, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6, v7);
    }/*}}}*/
};

//}}}1
}  // namespace Detail
}  // namespace Vc

#endif  // VC_SSE_DETAIL_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Forward declarations of the mask helpers used by Mask<T, VectorAbi::Sse> below.
// Size is the number of entries in the mask; the explicit specializations for
// Size = 2, 4, 8 and 16 are defined after the Mask class template.

// Number of set entries in the mask k.
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_count(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
// Bit field representation of the mask k.
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
// Equality / inequality comparison of two masks.
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L bool is_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L bool is_not_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
}  // namespace Detail

using SSE::sse_cast;

/**
 * Mask specialization for the SSE ABI.
 *
 * The mask is kept in a 128-bit register (member \c d) and is accessible as \c __m128,
 * \c __m128i or \c __m128d through data(), dataI() and dataD(). The number of entries
 * (\c Size) is taken from SSE::VectorTraits<T>.
 */
template <typename T> class Mask<T, VectorAbi::Sse>
{
    using abi = VectorAbi::Sse;
    // All SSE mask instantiations are friends of each other.
    friend class Mask<  double, abi>;
    friend class Mask<   float, abi>;
    friend class Mask< int32_t, abi>;
    friend class Mask<uint32_t, abi>;
    friend class Mask< int16_t, abi>;
    friend class Mask<uint16_t, abi>;

    /**
     * A helper type for aliasing the entries in the mask but behaving like a bool.
     */
    typedef Common::MaskBool<sizeof(T)> MaskBool;

    // Storage type of the member d.
    typedef Common::Storage<T, SSE::VectorTraits<T>::Size> Storage;

public:

    /**
     * The \c EntryType of masks is always bool, independent of \c T.
     */
    typedef bool EntryType;
    using value_type = EntryType;

    /**
     * The return type of the non-const subscript operator.
     */
    using EntryReference = Detail::ElementReference<Mask>;
    using reference = EntryReference;

    /**
     * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
     * implementation. This type is useful for the \c sizeof operator in generic functions.
     */
    typedef MaskBool VectorEntryType;

    /**
     * The \c VectorType reveals the implementation-specific internal type used for the SIMD type.
     */
    using VectorType = typename Storage::VectorType;

    /**
     * The associated Vector<T> type.
     */
    using Vector = SSE::Vector<T>;

public:
    Vc_FREE_STORE_OPERATORS_ALIGNED(16);
    /// Number of entries in the mask.
    static constexpr size_t Size = SSE::VectorTraits<T>::Size;
    static constexpr size_t MemoryAlignment = Size;
    static constexpr std::size_t size() { return Size; }

        // abstracts the way Masks are passed to functions, it can easily be changed to const ref here
#if defined Vc_MSVC && defined _WIN32
        typedef const Mask &Argument;
#else
        typedef Mask Argument;
#endif

        // The default constructor leaves the mask uninitialized.
        Vc_INTRINSIC Mask() {}
        // Implicit construction from any of the three 128-bit register types.
        Vc_INTRINSIC Mask(const __m128  &x) : d(sse_cast<VectorType>(x)) {}
        Vc_INTRINSIC Mask(const __m128d &x) : d(sse_cast<VectorType>(x)) {}
        Vc_INTRINSIC Mask(const __m128i &x) : d(sse_cast<VectorType>(x)) {}
        // All-false / all-true / broadcast-bool initialization.
        Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : Mask(_mm_setzero_ps()) {}
        Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : Mask(SSE::_mm_setallone_ps()) {}
        Vc_INTRINSIC explicit Mask(bool b) : Mask(b ? SSE::_mm_setallone_ps() : _mm_setzero_ps()) {}
        Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
        Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }

        // implicit cast
        template <typename U>
        Vc_INTRINSIC Mask(U &&rhs,
                          Common::enable_if_mask_converts_implicitly<T, U> = nullarg)
            : d(sse_cast<VectorType>(
                  Detail::mask_cast<Traits::simd_vector_size<U>::value, Size, __m128>(
                      rhs.dataI())))
        {
        }

#if Vc_IS_VERSION_1
        // explicit cast, implemented via simd_cast (implementation in sse/simd_cast.h)
        template <typename U>
        Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                      "mask types") Vc_INTRINSIC
            explicit Mask(U &&rhs,
                          Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif

        // Construction from an array of Size bools; the flags argument is accepted but
        // ignored (see the load overloads below).
        Vc_ALWAYS_INLINE explicit Mask(const bool *mem) { load(mem); }
        template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags f) { load(mem, f); }

        // Load from / store to an array of Size bools. The Flags overloads forward to the
        // flag-less versions, i.e. the flags have no effect.
        Vc_ALWAYS_INLINE_L void load(const bool *mem) Vc_ALWAYS_INLINE_R;
        template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { load(mem); }

        Vc_ALWAYS_INLINE_L void store(bool *) const Vc_ALWAYS_INLINE_R;
        template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { store(mem); }

        // Whole-mask comparison; yields a single bool.
        Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const
        {
            return Detail::is_equal<Size>(data(), rhs.data());
        }
        Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const
        {
            return Detail::is_not_equal<Size>(data(), rhs.data());
        }

        // Entry-wise negation: (~d) & all-ones.
        Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const { return _mm_andnot_si128(dataI(), SSE::_mm_setallone_si128()); }

        // Entry-wise compound logical operators.
        Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_and_ps(data(), rhs.data())); return *this; }
        Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_or_ps (data(), rhs.data())); return *this; }
        Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_xor_ps(data(), rhs.data())); return *this; }

        Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &rhs) const { return _mm_and_ps(data(), rhs.data()); }
        Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &rhs) const { return _mm_or_ps (data(), rhs.data()); }
        Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &rhs) const { return _mm_xor_ps(data(), rhs.data()); }

        // && and || are implemented exactly like & and | (entry-wise, no short-circuit).
        Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &rhs) const { return _mm_and_ps(data(), rhs.data()); }
        Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &rhs) const { return _mm_or_ps (data(), rhs.data()); }

        // Reductions. Without Vc_USE_PTEST they are computed from the 16-bit byte-wise
        // movemask of the register.
        Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return
#ifdef Vc_USE_PTEST
            _mm_testc_si128(dataI(), SSE::_mm_setallone_si128()); // return 1 if (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) == (~0 & d.v())
#else
            _mm_movemask_epi8(dataI()) == 0xffff;
#endif
        }
        Vc_ALWAYS_INLINE Vc_PURE bool isNotEmpty() const { return
#ifdef Vc_USE_PTEST
            0 == _mm_testz_si128(dataI(), dataI()); // return 1 if (0, 0, 0, 0) == (d.v() & d.v())
#else
            _mm_movemask_epi8(dataI()) != 0x0000;
#endif
        }
        Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return
#ifdef Vc_USE_PTEST
            0 != _mm_testz_si128(dataI(), dataI()); // return 1 if (0, 0, 0, 0) == (d.v() & d.v())
#else
            _mm_movemask_epi8(dataI()) == 0x0000;
#endif
        }
        // True if the mask is neither empty nor full.
        Vc_ALWAYS_INLINE Vc_PURE bool isMix() const {
#ifdef Vc_USE_PTEST
            return _mm_test_mix_ones_zeros(dataI(), SSE::_mm_setallone_si128());
#else
            const int tmp = _mm_movemask_epi8(dataI());
            return tmp != 0 && (tmp ^ 0xffff) != 0;
#endif
        }

        // Byte-wise movemask: one bit per byte of the register, independent of Size.
        Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { return _mm_movemask_epi8(dataI()); }

        // Size-dependent conversion to an integer (see Detail::mask_to_int).
        Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }

        // Access to the register as float, integer or double vector type.
        Vc_ALWAYS_INLINE Vc_PURE __m128  data () const { return SSE::sse_cast<__m128 >(d.v()); }
        Vc_ALWAYS_INLINE Vc_PURE __m128i dataI() const { return SSE::sse_cast<__m128i>(d.v()); }
        Vc_ALWAYS_INLINE Vc_PURE __m128d dataD() const { return SSE::sse_cast<__m128d>(d.v()); }

private:
    // get/set are the entry accessors used by the reference type (hence the friend
    // declaration) and by the const subscript operator.
    friend reference;
    static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
    {
        return MaskBool(m.d.m(i));
    }
    template <typename U>
    static Vc_INTRINSIC void set(Mask &m, int i,
                                 U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
    {
        m.d.set(i, MaskBool(std::forward<U>(v)));
    }

public:
    /**
     * \note the returned object models the concept of a reference and
     * as such it can exist longer than the data it is referencing.
     * \note to avoid lifetime issues, we strongly advise not to store
     * any reference objects.
     */
    Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
    {
        return {*this, int(index)};
    }
    Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
    {
        return get(*this, index);
    }

        /**
         * Returns the number of set entries (see Detail::mask_count).
         */
        Vc_ALWAYS_INLINE Vc_PURE int count() const
        {
            return Detail::mask_count<Size>(dataI());
        }

        /**
         * Returns the index of the first one in the mask.
         *
         * The return value is undefined if the mask is empty.
         */
        Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        // Declared here, defined out of class.
        template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;

    private:
#ifdef Vc_COMPILE_BENCHMARKS
    public:
#endif
        // The mask register; public only when compiling benchmarks.
        Storage d;
};
// Out-of-class definitions of the constexpr static data members; required when they are
// odr-used in language modes where such members are not implicitly inline (pre-C++17).
template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::MemoryAlignment;

}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/


namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/*mask_count{{{*/
// Two 64-bit entries: movemask_pd yields one bit per entry, so the count is the sum of
// the two bits.
template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
{
    const int bits = _mm_movemask_pd(_mm_castsi128_pd(k));
    return (bits & 1) + (bits >> 1);
}

// Four 32-bit entries: population count of the 4-bit movemask, or (without POPCNT) a
// horizontal sum of the entries' sign bits.
template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    const int bits = _mm_movemask_ps(_mm_castsi128_ps(k));
    return _mm_popcnt_u32(bits);
#else
    // every entry becomes 0 or 1
    const __m128i ones = _mm_srli_epi32(k, 31);
    // add the entry-reversed vector: lane 0 = e0 + e3, lane 1 = e1 + e2
    const __m128i pairs =
        _mm_add_epi32(ones, _mm_shuffle_epi32(ones, _MM_SHUFFLE(0, 1, 2, 3)));
    // swap the two low 32-bit lanes and add: lane 0 holds the total
    const __m128i total =
        _mm_add_epi32(pairs, _mm_shufflelo_epi16(pairs, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(total);
#endif
}

// Eight 16-bit entries: every entry contributes two bits to the byte-wise movemask,
// hence the division by two; without POPCNT the sign bits are summed horizontally.
template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    const int bits = _mm_movemask_epi8(k);
    return _mm_popcnt_u32(bits) / 2;
#else
    // every entry becomes 0 or 1
    const __m128i ones = _mm_srli_epi16(k, 15);
    // fold the upper half onto the lower half (32-bit lanes reversed)
    const __m128i sum2 =
        _mm_add_epi16(ones, _mm_shuffle_epi32(ones, _MM_SHUFFLE(0, 1, 2, 3)));
    // fold the four low words onto themselves in reversed order
    const __m128i sum4 =
        _mm_add_epi16(sum2, _mm_shufflelo_epi16(sum2, _MM_SHUFFLE(0, 1, 2, 3)));
    // add neighbouring words: word 0 holds the total
    const __m128i sum8 =
        _mm_add_epi16(sum4, _mm_shufflelo_epi16(sum4, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_extract_epi16(sum8, 0);
#endif
}

// Sixteen 8-bit entries: population count of the 16-bit byte-wise movemask.
template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
    const int bits = _mm_movemask_epi8(k);
    return Detail::popcnt16(bits);
}
/*}}}*/
// mask_to_int/*{{{*/
// Two 64-bit entries: one bit per entry (sign bits of the double lanes).
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
    const __m128d lanes = _mm_castsi128_pd(k);
    return _mm_movemask_pd(lanes);
}
// Four 32-bit entries: one bit per entry (sign bits of the float lanes).
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
    const __m128 lanes = _mm_castsi128_ps(k);
    return _mm_movemask_ps(lanes);
}
// Eight 16-bit entries: saturating pack to bytes first, so that the byte-wise movemask
// yields one bit per entry in its low eight bits.
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
    const __m128i packed = _mm_packs_epi16(k, _mm_setzero_si128());
    return _mm_movemask_epi8(packed);
}
// Sixteen 8-bit entries: the byte-wise movemask is the result.
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
    const int bits = _mm_movemask_epi8(k);
    return bits;
}
/*}}}*/
// mask_store/*{{{*/
// mask_store<N>(k, mem) writes the N entries of mask k to mem[0..N-1] as bools (0 or 1).
// No alignment of mem is assumed: the Flags overload of Mask::store discards its flags,
// so these helpers cannot rely on any alignment promise.
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
    // reduce every 0xff byte to 1; unaligned store because mem is an arbitrary bool *
    _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
    // 16-bit entries -> 0/1 words -> 0/1 bytes in the low 64 bits
    k = _mm_srli_epi16(k, 15);
    const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
    *reinterpret_cast<MayAlias<int64_t> *>(mem) = _mm_cvtsi128_si64(k2);
#else
    _mm_store_sd(reinterpret_cast<MayAlias<double> *>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
    // 32-bit entries -> 16-bit (saturating pack) -> 0/1 words -> 0/1 bytes in the low
    // 32 bits
    *reinterpret_cast<MayAlias<int32_t> *>(mem) = _mm_cvtsi128_si32(
        _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
                        _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
    // the upper 32-bit half of each 64-bit entry is 0 or -1; negation gives 0 or 1
    mem[0] = -SseIntrinsics::extract_epi32<1>(k);
    mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
/*}}}*/
// mask_load/*{{{*/
// mask_load<N>(mem) reads the N bools mem[0..N-1] and returns the corresponding N-entry
// mask (all bits of an entry set for true, cleared for false).
// No alignment of mem is assumed (the Flags overload of Mask::load discards its flags),
// and all type-punned reads of the bool array go through MayAlias<> types, matching
// mask_store.
template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
    // unaligned load because mem is an arbitrary bool *; bytes > 0 become 0xff
    return sse_cast<__m128>(_mm_cmpgt_epi8(
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
    // fetch 8 bools (64 bits) into the low half of the register
#ifdef __x86_64__
    __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const MayAlias<int64_t> *>(mem));
#else
    __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const MayAlias<double> *>(mem)));
#endif
    // duplicate every byte into a 16-bit word, then compare against zero
    return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
    // fetch 4 bools (32 bits)
    __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int> *>(mem));
    // bytes -> 16-bit 0/0xffff words -> duplicated into 32-bit entries
    k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
    return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
    // each bool fills both 32-bit halves of its 64-bit entry with 0 or -1
    return sse_cast<__m128>(
        _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
/*}}}*/
// is_equal{{{
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

// Masks with 16 entries: compare the per-byte sign bits of both registers.
template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
    const int lhsBits = _mm_movemask_epi8(_mm_castps_si128(k1));
    const int rhsBits = _mm_movemask_epi8(_mm_castps_si128(k2));
    return lhsBits == rhsBits;
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
    const int lhsBits = _mm_movemask_epi8(_mm_castps_si128(k1));
    const int rhsBits = _mm_movemask_epi8(_mm_castps_si128(k2));
    return lhsBits != rhsBits;
}

// }}}
}  // namespace Detail

// Writes the two mask entries to mem[0] and mem[1] with a single 16-bit store.
template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
    // may-alias 16-bit type used to write both bools at once
    typedef uint16_t boolAlias Vc_MAY_ALIAS;
    // One bit per byte of the register; bit 0 belongs to the first 64-bit lane and
    // bit 8 to the second. Keeping only those two bits leaves 0 or 1 in each byte of
    // the stored 16-bit value.
    const int byteBits = _mm_movemask_epi8(dataI());
    *reinterpret_cast<boolAlias *>(mem) = byteBits & 0x0101;
}
// Generic store: hands the integer view of the mask to the Detail::mask_store
// overload selected by the number of entries.
template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
    const auto bits = dataI();
    Detail::mask_store<Size>(bits, mem);
}
// Sets both mask entries individually from mem[0] and mem[1].
template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
{
    for (int entry = 0; entry < 2; ++entry) {
        d.set(entry, MaskBool(mem[entry]));
    }
}
// Generic load: builds the register via the Detail::mask_load specialization for
// this mask's entry count and stores it, cast to VectorType.
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
    const __m128 loaded = Detail::mask_load<Size>(mem);
    d.v() = sse_cast<VectorType>(loaded);
}

// get / operator[] {{{1
// Entry access for 16-bit masks: shiftMask() is tested at bit position 2 * index.
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
    const int selector = 1 << (2 * index);
    return m.shiftMask() & selector;
}
// Entry access for unsigned 16-bit masks: shiftMask() is tested at bit position
// 2 * index.
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
    const int selector = 1 << (2 * index);
    return m.shiftMask() & selector;
}

// firstOne {{{1
// Returns the position of the lowest set bit in toInt(), found with a bit-scan-forward
// (_BitScanForward on MSVC, the bsf instruction elsewhere).
// NOTE(review): `bit` is never initialized and a bit scan of 0 does not produce a
// defined result, so callers presumably must only call this on a non-empty mask -
// confirm.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
    const int mask = toInt();
#ifdef _MSC_VER
    unsigned long bit;
    _BitScanForward(&bit, mask);
#else
    int bit;
    // %0 = bit (early-clobber output register), %1 = mask (input register)
    __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
    return bit;
}

// generate {{{1
// Builds a 2-entry mask from a generator: entry i is all ones if gen(i) is true and
// zero otherwise.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
    // Call the generator in ascending index order. Calling it directly inside the
    // argument list of the intrinsic would leave the call order unspecified, which is
    // observable for generators with side effects.
    const long long entry0 = gen(0) ? -1ll : 0ll;
    const long long entry1 = gen(1) ? -1ll : 0ll;
    // _mm_set_epi64x takes the high element first.
    return _mm_set_epi64x(entry1, entry0);
}
// Builds a 4-entry mask from a generator: entry i is all ones if gen(i) is true and
// zero otherwise.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
    // Call the generator in ascending index order. Calling it directly inside the
    // argument list of the intrinsic would leave the call order unspecified, which is
    // observable for generators with side effects. -1 is the all-ones 32-bit pattern
    // and avoids converting an out-of-range unsigned literal to int.
    const int entry0 = gen(0) ? -1 : 0;
    const int entry1 = gen(1) ? -1 : 0;
    const int entry2 = gen(2) ? -1 : 0;
    const int entry3 = gen(3) ? -1 : 0;
    return _mm_setr_epi32(entry0, entry1, entry2, entry3);
}
// Builds an 8-entry mask from a generator: entry i is all ones if gen(i) is true and
// zero otherwise.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
    // Call the generator in ascending index order. Calling it directly inside the
    // argument list of the intrinsic would leave the call order unspecified, which is
    // observable for generators with side effects. short(-1) is the all-ones 16-bit
    // pattern and avoids converting an out-of-range unsigned literal to short.
    const short entry0 = gen(0) ? short(-1) : short(0);
    const short entry1 = gen(1) ? short(-1) : short(0);
    const short entry2 = gen(2) ? short(-1) : short(0);
    const short entry3 = gen(3) ? short(-1) : short(0);
    const short entry4 = gen(4) ? short(-1) : short(0);
    const short entry5 = gen(5) ? short(-1) : short(0);
    const short entry6 = gen(6) ? short(-1) : short(0);
    const short entry7 = gen(7) ? short(-1) : short(0);
    return _mm_setr_epi16(entry0, entry1, entry2, entry3,
                          entry4, entry5, entry6, entry7);
}
// Creates a mask from a generator by dispatching on the entry count to the matching
// generate_impl overload.
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
    using EntryCountTag = std::integral_constant<int, Size>;
    return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen), EntryCountTag());
}
// shifted {{{1
// Returns this mask shifted by `amount` entries.
//
// The entry count is converted into a byte count (amount * sizeof(VectorEntryType)).
// Detail::shifted takes that byte count as a template argument, hence the explicit
// switch over every value from -16 to 16. A byte count of 0 returns the mask
// unchanged; any byte count outside [-16, 16] matches no case and yields Zero().
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case   0: return *this;
    case   1: return Detail::shifted<  1>(dataI());
    case   2: return Detail::shifted<  2>(dataI());
    case   3: return Detail::shifted<  3>(dataI());
    case   4: return Detail::shifted<  4>(dataI());
    case   5: return Detail::shifted<  5>(dataI());
    case   6: return Detail::shifted<  6>(dataI());
    case   7: return Detail::shifted<  7>(dataI());
    case   8: return Detail::shifted<  8>(dataI());
    case   9: return Detail::shifted<  9>(dataI());
    case  10: return Detail::shifted< 10>(dataI());
    case  11: return Detail::shifted< 11>(dataI());
    case  12: return Detail::shifted< 12>(dataI());
    case  13: return Detail::shifted< 13>(dataI());
    case  14: return Detail::shifted< 14>(dataI());
    case  15: return Detail::shifted< 15>(dataI());
    case  16: return Detail::shifted< 16>(dataI());
    case  -1: return Detail::shifted< -1>(dataI());
    case  -2: return Detail::shifted< -2>(dataI());
    case  -3: return Detail::shifted< -3>(dataI());
    case  -4: return Detail::shifted< -4>(dataI());
    case  -5: return Detail::shifted< -5>(dataI());
    case  -6: return Detail::shifted< -6>(dataI());
    case  -7: return Detail::shifted< -7>(dataI());
    case  -8: return Detail::shifted< -8>(dataI());
    case  -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    }
    // everything was shifted out
    return Zero();
}
// }}}1

}

// vim: foldmethod=marker

#endif // VC_SSE_MASK_H_
#include <algorithm>
#include <cmath>


#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif

namespace Vc_VERSIONED_NAMESPACE
{

#define Vc_CURRENT_CLASS_NAME Vector
/**
 * Partial specialization of Vector for the SSE ABI.
 *
 * The data lives in the single member \c d of type StorageType; storage, register,
 * entry and mask types as well as the number of entries (Size) are all taken from
 * SSE::VectorTraits<T>. Members declared with Vc_INTRINSIC_L ... Vc_INTRINSIC_R (or
 * the Vc_ALWAYS_INLINE_L/_R pair) are only declared inside this class body; their
 * definitions are not part of it.
 */
template <typename T> class Vector<T, VectorAbi::Sse>
{
    static_assert(std::is_arithmetic<T>::value,
                  "Vector<T> only accepts arithmetic builtin types as template parameter T.");

    protected:
#ifdef Vc_COMPILE_BENCHMARKS
    public:
#endif
        typedef typename SSE::VectorTraits<T>::StorageType StorageType;
        StorageType d;
        typedef typename SSE::VectorTraits<T>::GatherMaskType GatherMask;
        // HV: helper functions keyed on the register type; HT: keyed on the entry type
        typedef SSE::VectorHelper<typename SSE::VectorTraits<T>::VectorType> HV;
        typedef SSE::VectorHelper<T> HT;
    public:
        Vc_FREE_STORE_OPERATORS_ALIGNED(16);

        typedef typename SSE::VectorTraits<T>::VectorType VectorType;
        using vector_type = VectorType;
        static constexpr size_t Size = SSE::VectorTraits<T>::Size;
        static constexpr size_t MemoryAlignment = alignof(VectorType);
        typedef typename SSE::VectorTraits<T>::EntryType EntryType;
        using value_type = EntryType;
        using VectorEntryType = EntryType;
        // vectors with fewer than 4 entries use an index type built from scalar int vectors
        typedef typename std::conditional<(Size >= 4),
                                          SimdArray<int, Size, SSE::int_v, 4>,
                                          SimdArray<int, Size, Scalar::int_v, 1>>::type IndexType;
        typedef typename SSE::VectorTraits<T>::MaskType Mask;
        using MaskType = Mask;
        using mask_type = Mask;
        typedef typename Mask::Argument MaskArg;
        typedef typename Mask::Argument MaskArgument;
        typedef const Vector AsArg;
        using abi = VectorAbi::Sse;
        using WriteMaskedVector = Common::WriteMaskedVector<Vector, Mask>;
        template <typename U> using V = Vector<U, abi>;

        using reference = Detail::ElementReference<Vector>;


        static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;

        ///////////////////////////////////////////////////////////////////////////////////////////
        // internal: required to enable returning objects of VectorType
        Vc_ALWAYS_INLINE Vector(VectorType x) : d(x) {}

        // implicit conversion from compatible Vector<U>
        template <typename U>
        Vc_INTRINSIC Vector(
            V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
                                            void *>::type = nullptr)
            : d(SSE::convert<U, T>(x.data()))
        {
        }

#if Vc_IS_VERSION_1
        // static_cast from the remaining Vector<U>
        template <typename U>
        Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                      "vector types") Vc_INTRINSIC
            explicit Vector(
                V<U> x,
                typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
                                        void *>::type = nullptr)
            : d(SSE::convert<U, T>(x.data()))
        {
        }
#endif

        ///////////////////////////////////////////////////////////////////////////////////////////
        // broadcast
        Vc_INTRINSIC Vector(EntryType a) : d(HT::set(a)) {}
        // additionally accept a plain int when EntryType is not int itself; the value
        // is converted with static_cast<EntryType>
        template <typename U>
        Vc_INTRINSIC Vector(U a,
                            typename std::enable_if<std::is_same<U, int>::value &&
                                                        !std::is_same<U, EntryType>::value,
                                                    void *>::type = nullptr)
            : Vector(static_cast<EntryType>(a))
        {
        }


        ///////////////////////////////////////////////////////////////////////////////////////////
        // zeroing
        Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;

        Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setQnan(const Mask &k) Vc_INTRINSIC_R;


        //prefix
        Vc_INTRINSIC Vector &operator++() { data() = HT::add(data(), HT::one()); return *this; }
        Vc_INTRINSIC Vector &operator--() { data() = HT::sub(data(), HT::one()); return *this; }
        //postfix
        Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = HT::add(data(), HT::one()); return r; }
        Vc_INTRINSIC Vector operator--(int) { const Vector r = *this; data() = HT::sub(data(), HT::one()); return r; }

    private:
        // element accessors used by the `reference` proxy type (hence the friend declaration)
        friend reference;
        Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
        {
            return o.d.m(i);
        }
        template <typename U>
        Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
            noexcept(std::declval<value_type &>() = v))
        {
            o.d.set(i, v);
        }

    public:
        /**
         * \note the returned object models the concept of a reference and
         * as such it can exist longer than the data it is referencing.
         * \note to avoid lifetime issues, we strongly advise not to store
         * any reference objects.
         */
        Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
        {
            static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
            return {*this, int(index)};
        }
        // read-only element access returns the entry by value
        Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
        {
            return d.m(index);
        }

        Vc_INTRINSIC_L Vector Vc_VDECL operator[](const SSE::int_v &perm) const Vc_INTRINSIC_R;

        // logical not: mask of the entries that compare equal to zero
        Vc_INTRINSIC Vc_PURE Mask operator!() const
        {
            return *this == Zero();
        }
        // bitwise complement, computed as andnot_(data, all-ones); restricted to
        // integral T unless Vc_ENABLE_FLOAT_BIT_OPERATORS is defined
        Vc_INTRINSIC Vc_PURE Vector operator~() const
        {
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
            static_assert(std::is_integral<T>::value,
                          "bit-complement can only be used with Vectors of integral type");
#endif
            return Detail::andnot_(data(), HV::allone());
        }
        Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
        Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }

        // shifts by a per-entry shift count; evaluated entry by entry through generate()
        Vc_ALWAYS_INLINE Vector  Vc_VDECL operator<< (AsArg shift) const { return generate([&](int i) { return get(*this, i) << get(shift, i); }); }
        Vc_ALWAYS_INLINE Vector  Vc_VDECL operator>> (AsArg shift) const { return generate([&](int i) { return get(*this, i) >> get(shift, i); }); }
        Vc_ALWAYS_INLINE Vector &Vc_VDECL operator<<=(AsArg shift) { return *this = *this << shift; }
        Vc_ALWAYS_INLINE Vector &Vc_VDECL operator>>=(AsArg shift) { return *this = *this >> shift; }

        // shifts of all entries by the same scalar count
        Vc_INTRINSIC_L Vector &Vc_VDECL operator<<=(  int shift)       Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector  Vc_VDECL operator<< (  int shift) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector &Vc_VDECL operator>>=(  int shift)       Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector  Vc_VDECL operator>> (  int shift) const Vc_INTRINSIC_R;

        Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
            isNegative() const
        {
            return Vc::isnegative(*this);
        }

        // masked assignment: blends the current data with v, using the mask register
        // reinterpreted as VectorType as the selector
        // NOTE(review): presumably entries of v are taken where mask is set - confirm
        // against HV::blend
        Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask)
        {
            const VectorType k = SSE::sse_cast<VectorType>(mask.data());
            data() = HV::blend(data(), v.data(), k);
        }

        template <typename V2>
        Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast")
            Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const
        {
            return SSE::convert<T, typename V2::EntryType>(data());
        }
        template <typename V2>
        Vc_DEPRECATED("use reinterpret_components_cast instead")
            Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const
        {
            return SSE::sse_cast<typename V2::VectorType>(data());
        }

        // v(k) yields a WriteMaskedVector built from this vector and the mask k
        Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return {*this, k}; }

        // direct access to the underlying register
        Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); }
        Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); }

        template<int Index>
        Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;

        // horizontal reductions over all entries, and over the entries selected by a mask
        Vc_INTRINSIC EntryType min() const { return HT::min(data()); }
        Vc_INTRINSIC EntryType max() const { return HT::max(data()); }
        Vc_INTRINSIC EntryType product() const { return HT::mul(data()); }
        Vc_INTRINSIC EntryType sum() const { return HT::add(data()); }
        Vc_INTRINSIC_L Vector partialSum() const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R;

        Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
        Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        // Calls f with the first entry and afterwards with every entry that differs
        // from the value f was last called with.
        // NOTE(review): no sorting happens here; presumably the vector is expected to
        // be sorted already so that each distinct value is reported once - confirm.
        template <typename F> void callWithValuesSorted(F &&f)
        {
            EntryType value = d.m(0);
            f(value);
            for (std::size_t i = 1; i < Size; ++i) {
                if (d.m(i) != value) {
                    value = d.m(i);
                    f(value);
                }
            }
        }

        // calls f once for every entry
        template <typename F> Vc_INTRINSIC void call(F &&f) const
        {
            Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
        }

        // calls f only for the entries whose index is produced by where(mask)
        template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
        {
            for(size_t i : where(mask)) {
                f(EntryType(d.m(i)));
            }
        }

        // returns a new vector whose entry i is f(entry i of *this)
        template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
        {
            Vector r;
            Common::for_all_vector_entries<Size>(
                [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
            return r;
        }
        // like apply(f), but starts from a copy of *this and only replaces the
        // entries whose index is produced by where(mask)
        template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
        {
            Vector r(*this);
            for (size_t i : where(mask)) {
                r.d.set(i, f(EntryType(r.d.m(i))));
            }
            return r;
        }

        // overwrites every entry with f(i), respectively f()
        template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
            Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
        }
        Vc_INTRINSIC void fill(EntryType (&f)()) {
            Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
        }

        template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;

        Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
            copySign(AsArg reference) const
        {
            return Vc::copysign(*this, reference);
        }

        Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
        {
            return Vc::exponent(*this);
        }

        Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
#undef Vc_CURRENT_CLASS_NAME
// Namespace-scope definitions for the static constexpr data members declared in the
// class, so that they may also be odr-used (e.g. bound to a reference).
template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::MemoryAlignment;

// Entry-wise minimum of two SSE vectors, one overload per entry type.
static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v min(const SSE::int_v &x, const SSE::int_v &y)
{
    return SSE::min_epi32(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v min(const SSE::uint_v &x, const SSE::uint_v &y)
{
    return SSE::min_epu32(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v min(const SSE::short_v &x, const SSE::short_v &y)
{
    return _mm_min_epi16(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y)
{
    return SSE::min_epu16(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v min(const SSE::float_v &x, const SSE::float_v &y)
{
    return _mm_min_ps(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y)
{
    return _mm_min_pd(x.data(), y.data());
}
// Entry-wise maximum of two SSE vectors, one overload per entry type.
static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v max(const SSE::int_v &x, const SSE::int_v &y)
{
    return SSE::max_epi32(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v max(const SSE::uint_v &x, const SSE::uint_v &y)
{
    return SSE::max_epu32(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v max(const SSE::short_v &x, const SSE::short_v &y)
{
    return _mm_max_epi16(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y)
{
    return SSE::max_epu16(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v max(const SSE::float_v &x, const SSE::float_v &y)
{
    return _mm_max_ps(x.data(), y.data());
}
static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y)
{
    return _mm_max_pd(x.data(), y.data());
}

// Entry-wise absolute value; only enabled for T = double, float, short and int.
template <typename T,
          typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
                               std::is_same<T, short>::value ||
                               std::is_same<T, int>::value>>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> abs(Vector<T, VectorAbi::Sse> x)
{
    using Helper = SSE::VectorHelper<T>;
    return Helper::abs(x.data());
}

  // Entry-wise math functions; each forwards the register to the function of the same
  // name in SSE::VectorHelper<T>.
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> sqrt(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::sqrt(x.data());
  }
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> rsqrt(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::rsqrt(x.data());
  }
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> reciprocal(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::reciprocal(x.data());
  }
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> round(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::round(x.data());
  }

  // Entry-wise classification functions; each returns the mask produced by the
  // corresponding SSE::VectorHelper<T> function (isFinite, isInfinite, isNaN).
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isfinite(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::isFinite(x.data());
  }
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isinf(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::isInfinite(x.data());
  }
  template<typename T>
  Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isnan(const Vector<T, VectorAbi::Sse> &x)
  {
      return SSE::VectorHelper<T>::isNaN(x.data());
  }

// Generates one conditional_assign overload per (compound) assignment operator.
// Each overload is only enabled when the Operator template argument O equals
// Operator::name_ and then executes `lhs(mask) op_ rhs`, i.e. the operator is applied
// to the object returned by Vector::operator()(mask). The macro is undefined again
// after the list of instantiations.
#define Vc_CONDITIONAL_ASSIGN(name_, op_)                                                \
    template <Operator O, typename T, typename M, typename U>                            \
    Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign(               \
        Vector<T, VectorAbi::Sse> &lhs, M &&mask, U &&rhs)                               \
    {                                                                                    \
        lhs(mask) op_ rhs;                                                               \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(          Assign,  =);
Vc_CONDITIONAL_ASSIGN(      PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN(     MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN(  MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN(    DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN(       XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN(       AndAssign, &=);
Vc_CONDITIONAL_ASSIGN(        OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN

// Same scheme for the increment and decrement operators, which take no right-hand
// side: the overload enabled for Operator::name_ returns the value of expr_ as a
// Vector<T, VectorAbi::Sse>. The macro is undefined again after the list of
// instantiations.
#define Vc_CONDITIONAL_ASSIGN(name_, expr_)                                              \
    template <Operator O, typename T, typename M>                                        \
    Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Sse>>              \
    conditional_assign(Vector<T, VectorAbi::Sse> &lhs, M &&mask)                         \
    {                                                                                    \
        return expr_;                                                                    \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN

}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_X86_PREFETCHES_H_
#define VC_COMMON_X86_PREFETCHES_H_

#include <xmmintrin.h>

namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

// Value OR-ed into the prefetch hint when an exclusive prefetch is requested:
// 0x4 when Vc_IMPL_MIC is defined, 0 (no effect on the hint) otherwise.
#ifndef Vc_IMPL_MIC
static constexpr int exclusive_hint = 0;
#else
static constexpr int exclusive_hint = 0x4;
#endif

// TODO: support AMD's prefetchw with correct flags and checks via cpuid

// Issues a prefetch for addr with the _MM_HINT_NTA hint. For any
// ExclusiveOrShared other than Vc::Shared the hint is combined with exclusive_hint.
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchForOneRead(const void *addr)
{
    // _mm_prefetch takes a non-const char pointer
    char *const target = static_cast<char *>(const_cast<void *>(addr));
    if (!std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
        _mm_prefetch(target,
                     static_cast<decltype(_MM_HINT_NTA)>(_MM_HINT_NTA | exclusive_hint));
        return;
    }
    _mm_prefetch(target, _MM_HINT_NTA);
}
// Issues a prefetch for addr with the _MM_HINT_T0 hint. For any ExclusiveOrShared
// other than Vc::Shared the hint is combined with exclusive_hint.
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchClose(const void *addr)
{
    // _mm_prefetch takes a non-const char pointer
    char *const target = static_cast<char *>(const_cast<void *>(addr));
    if (!std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
        _mm_prefetch(target,
                     static_cast<decltype(_MM_HINT_T0)>(_MM_HINT_T0 | exclusive_hint));
        return;
    }
    _mm_prefetch(target, _MM_HINT_T0);
}
// Issues a prefetch for addr with the _MM_HINT_T1 hint. For any ExclusiveOrShared
// other than Vc::Shared the hint is combined with exclusive_hint.
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchMid(const void *addr)
{
    // _mm_prefetch takes a non-const char pointer
    char *const target = static_cast<char *>(const_cast<void *>(addr));
    if (!std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
        _mm_prefetch(target,
                     static_cast<decltype(_MM_HINT_T1)>(_MM_HINT_T1 | exclusive_hint));
        return;
    }
    _mm_prefetch(target, _MM_HINT_T1);
}
// Issues a prefetch for addr with the _MM_HINT_T2 hint. For any ExclusiveOrShared
// other than Vc::Shared the hint is combined with exclusive_hint.
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchFar(const void *addr)
{
    // _mm_prefetch takes a non-const char pointer
    char *const target = static_cast<char *>(const_cast<void *>(addr));
    if (!std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
        _mm_prefetch(target,
                     static_cast<decltype(_MM_HINT_T2)>(_MM_HINT_T2 | exclusive_hint));
        return;
    }
    _mm_prefetch(target, _MM_HINT_T2);
}

/*handlePrefetch/handleLoadPrefetches/handleStorePrefetches{{{*/
namespace
{
// handlePrefetch<L1, L2, UseExclusivePrefetch>(addr): L1 and L2 are byte offsets
// added to addr. A non-zero L1 triggers prefetchClose(addr + L1), a non-zero L2
// triggers prefetchMid(addr + L2); a zero offset disables the respective prefetch.
// The four overloads are selected via enable_if on which offsets are non-zero.

// both offsets set
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 != 0, void *>::type = nullptr)
{
    using Mode = typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type;
    const char *const base = static_cast<const char *>(addr_);
    prefetchClose<Mode>(base + L1);
    prefetchMid<Mode>(base + L2);
}
// only L2 set
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 == 0 && L2 != 0, void *>::type = nullptr)
{
    using Mode = typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type;
    const char *const base = static_cast<const char *>(addr_);
    prefetchMid<Mode>(base + L2);
}
// only L1 set
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 == 0, void *>::type = nullptr)
{
    using Mode = typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type;
    const char *const base = static_cast<const char *>(addr_);
    prefetchClose<Mode>(base + L1);
}
// neither offset set: nothing to prefetch
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *, typename std::enable_if<L1 == 0 && L2 == 0, void *>::type = nullptr)
{
}

// Flags without prefetching: nothing to do.
template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void *    , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
// Flags with prefetching: prefetch relative to addr using the strides from Flags.
template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch    = nullptr)
{
    // load prefetches default to Shared unless Exclusive was explicitly selected
    constexpr bool useExclusive = Flags::IsExclusivePrefetch;
    handlePrefetch<Flags::L1Stride, Flags::L2Stride, useExclusive>(addr);
}

// Flags without prefetching: nothing to do.
template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void *    , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
// Flags with prefetching: prefetch relative to addr using the strides from Flags.
template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch    = nullptr)
{
    // store prefetches default to Exclusive unless Shared was explicitly selected
    constexpr bool useExclusive = !Flags::IsSharedPrefetch;
    handlePrefetch<Flags::L1Stride, Flags::L2Stride, useExclusive>(addr);
}

} // anonymous namespace
/*}}}*/

}  // namespace Common

using Common::prefetchForOneRead;
using Common::prefetchClose;
using Common::prefetchMid;
using Common::prefetchFar;
}  // namespace Vc

#endif // VC_COMMON_X86_PREFETCHES_H_

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_LIMITS_H_
#define VC_SSE_LIMITS_H_


namespace std
{
// numeric_limits for SSE vectors of unsigned short. All static constants are inherited
// from numeric_limits<unsigned short>; the value-returning functions are replaced by
// versions returning a vector: max() is the all-ones bit pattern, every other
// function returns a zero vector (lowest() via min()).
template<> struct numeric_limits< ::Vc::SSE::ushort_v> : public numeric_limits<unsigned short>
{
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v max()           Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v min()           Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v lowest()        Vc_NOEXCEPT { return min(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v epsilon()       Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v round_error()   Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v infinity()      Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v quiet_NaN()     Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
    static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v denorm_min()    Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
};
// std::numeric_limits for the SSE vector of short. Scalar traits are inherited
// from numeric_limits<short>; the value-returning queries are redefined here to
// return whole vectors.
template<> struct numeric_limits< ::Vc::SSE::short_v> : public numeric_limits<short>
{
private:
    typedef ::Vc::SSE::short_v V;

public:
    // max(): all-ones shifted right logically by one bit per 16-bit lane.
    static Vc_INTRINSIC Vc_CONST V max()           Vc_NOEXCEPT { return _mm_srli_epi16(::Vc::SSE::_mm_setallone_si128(), 1); }
    // min()/lowest(): whatever SSE::setmin_epi16() produces.
    static Vc_INTRINSIC Vc_CONST V min()           Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi16(); }
    static Vc_INTRINSIC Vc_CONST V lowest()        Vc_NOEXCEPT { return min(); }

    // All remaining queries return the zero vector.
    static Vc_INTRINSIC Vc_CONST V epsilon()       Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V round_error()   Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V infinity()      Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V quiet_NaN()     Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V signaling_NaN() Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V denorm_min()    Vc_NOEXCEPT { return V::Zero(); }
};
// std::numeric_limits for the SSE vector of unsigned int. Scalar traits are
// inherited from numeric_limits<unsigned int>; the value-returning queries are
// redefined here to return whole vectors.
template<> struct numeric_limits< ::Vc::SSE::uint_v> : public numeric_limits<unsigned int>
{
private:
    typedef ::Vc::SSE::uint_v V;

public:
    // max() is built from _mm_setallone_si128(); min()/lowest() are the zero vector.
    static Vc_INTRINSIC Vc_CONST V max()           Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
    static Vc_INTRINSIC Vc_CONST V min()           Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V lowest()        Vc_NOEXCEPT { return min(); }

    // All remaining queries return the zero vector.
    static Vc_INTRINSIC Vc_CONST V epsilon()       Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V round_error()   Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V infinity()      Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V quiet_NaN()     Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V signaling_NaN() Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V denorm_min()    Vc_NOEXCEPT { return V::Zero(); }
};
// std::numeric_limits for the SSE vector of int. Scalar traits are inherited
// from numeric_limits<int>; the value-returning queries are redefined here to
// return whole vectors.
template<> struct numeric_limits< ::Vc::SSE::int_v> : public numeric_limits<int>
{
private:
    typedef ::Vc::SSE::int_v V;

public:
    // max(): all-ones shifted right logically by one bit per 32-bit lane.
    static Vc_INTRINSIC Vc_CONST V max()           Vc_NOEXCEPT { return _mm_srli_epi32(::Vc::SSE::_mm_setallone_si128(), 1); }
    // min()/lowest(): whatever SSE::setmin_epi32() produces.
    static Vc_INTRINSIC Vc_CONST V min()           Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi32(); }
    static Vc_INTRINSIC Vc_CONST V lowest()        Vc_NOEXCEPT { return min(); }

    // All remaining queries return the zero vector.
    static Vc_INTRINSIC Vc_CONST V epsilon()       Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V round_error()   Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V infinity()      Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V quiet_NaN()     Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V signaling_NaN() Vc_NOEXCEPT { return V::Zero(); }
    static Vc_INTRINSIC Vc_CONST V denorm_min()    Vc_NOEXCEPT { return V::Zero(); }
};
} // namespace std

#endif // VC_SSE_LIMITS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_BITSCANINTRINSICS_H_
#define VC_COMMON_BITSCANINTRINSICS_H_

// Provide _bit_scan_forward(x) / _bit_scan_reverse(x) on every supported
// compiler. Both yield the index of the lowest / highest set bit of x; the
// result is unspecified for x == 0.
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG)
#  if Vc_GCC >= 0x40500
     // GCC 4.5.0 introduced _bit_scan_forward / _bit_scan_reverse
#    include <x86intrin.h>
#  else
     // GCC <= 4.4 and clang have x86intrin.h, but not the required functions
#    define _bit_scan_forward(x) __builtin_ctz(x)
// Returns the index of the most significant set bit of x using the x86 `bsr`
// instruction.
static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) {
    int r;
    // `bsr` only accepts a register or memory source operand. The "rm"
    // constraint states exactly that; an unconstrained operand ("X") would
    // allow the compiler to substitute an immediate for a constant argument,
    // which the assembler rejects.
    __asm__("bsr %1,%0" : "=r"(r) : "rm"(x));
    return r;
}
#    define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x)
#  endif
#elif defined(_WIN32)
#include <intrin.h>
// Wrapper around _BitScanForward that returns the bit index by value.
static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) {
    unsigned long index;
    _BitScanForward(&index, x);
    return index;
}
// Wrapper around _BitScanReverse that returns the bit index by value.
static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) {
    unsigned long index;
    _BitScanReverse(&index, x);
    return index;
}
#elif defined(Vc_ICC)
// for all I know ICC supports the _bit_scan_* intrinsics
#else
// just assume the compiler can do it
#endif


#endif // VC_COMMON_BITSCANINTRINSICS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_SET_H_
#define VC_COMMON_SET_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
    // Builds an __m128i from eight 16-bit values, x0 in the lowest 16 bits and
    // x7 in the highest.
    //
    // Neighbouring arguments are first packed into four 32-bit words, which
    // are then moved into the vector register either with hand-written
    // instructions (GNU inline asm) or with _mm_setr_epi32.
    //
    // The inline-asm variants are only used when the required instructions are
    // known to be available: `pinsrd` is an SSE4.1 instruction, so on
    // SSE2/SSE3-only targets the intrinsic fallback is used instead of
    // emitting an instruction the CPU may not support.
    // (A 64-bit vmovq/vpinsrq variant was tried previously and disabled
    // because the 32-bit sequence appeared to be faster.)
    static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3,
            unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7)
    {
        // Pack pairs: the odd-numbered argument goes into the upper 16 bits.
        const unsigned int tmp0 = (static_cast<unsigned int>(x1) << 16) | x0;
        const unsigned int tmp1 = (static_cast<unsigned int>(x3) << 16) | x2;
        const unsigned int tmp2 = (static_cast<unsigned int>(x5) << 16) | x4;
        const unsigned int tmp3 = (static_cast<unsigned int>(x7) << 16) | x6;
#if defined(Vc_GNU_ASM) && defined(Vc_USE_VEX_CODING)
        __m128i r0, r1;
        asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0));
        asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1));
        asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2));
        asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3));
        // r0 = [tmp0, tmp1], r1 = [tmp2, tmp3] -> interleave the low quadwords
        asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1));
        return r0;
#elif defined(Vc_GNU_ASM) && defined(__SSE4_1__)
        __m128i r0, r1;
        asm("movd %1,%0" : "=x"(r0) : "r"(tmp0));
        asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1));
        asm("movd %1,%0" : "=x"(r1) : "r"(tmp2));
        asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3));
        // r0 = [tmp0, tmp1], r1 = [tmp2, tmp3] -> interleave the low quadwords
        asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1));
        return r0;
#else
        return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
#endif
    }
    // Signed overload: converts every argument to unsigned short and forwards
    // to the unsigned short overload of set().
    static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7)
    {
        typedef unsigned short U;
        return set(static_cast<U>(x0), static_cast<U>(x1),
                   static_cast<U>(x2), static_cast<U>(x3),
                   static_cast<U>(x4), static_cast<U>(x5),
                   static_cast<U>(x6), static_cast<U>(x7));
    }
}  // anonymous namespace
}  // namespace Vc

#endif // VC_COMMON_SET_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
#define VC_COMMON_GATHERIMPLEMENTATION_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

// Identifies the strategy used to implement masked gathers and scatters.
enum class GatherScatterImplementation : int {
    SimpleLoop   = 0,
    SetIndexZero = 1,
    BitScanLoop  = 2,
    PopcntSwitch = 3
};

// One tag type per strategy; passed as the first argument of
// executeGather / executeScatter to select the matching overload.
typedef std::integral_constant<GatherScatterImplementation,
                               GatherScatterImplementation::SimpleLoop>
    SimpleLoopT;
typedef std::integral_constant<GatherScatterImplementation,
                               GatherScatterImplementation::SetIndexZero>
    SetIndexZeroT;
typedef std::integral_constant<GatherScatterImplementation,
                               GatherScatterImplementation::BitScanLoop>
    BitScanLoopT;
typedef std::integral_constant<GatherScatterImplementation,
                               GatherScatterImplementation::PopcntSwitch>
    PopcntSwitchT;

// Masked gather (SetIndexZero strategy): a private copy of the index vector is
// adjusted via setZeroInverted(mask) (presumably zeroing the entries of
// inactive lanes - confirm against the index vector type), a full gather is
// performed with the adjusted indexes, and the result is written to v only
// where mask is set.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
                                    V &v,
                                    const MT *mem,
                                    IT &&indexes_,
                                    typename V::MaskArgument mask)
{
    auto adjusted = std::forward<IT>(indexes_);
    // The mask type belonging to the index vector is whatever operator! yields.
    using IndexMask = decltype(!adjusted);
    adjusted.setZeroInverted(static_cast<IndexMask>(mask));
    const V gathered(mem, adjusted);
    where(mask) | v = gathered;
}

// Masked gather (SimpleLoop strategy): visits every lane index in
// [0, V::Size) and loads v[i] = mem[indexes[i]] for the lanes whose mask entry
// is set. Returns immediately when the mask is empty.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SimpleLoopT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask)
{
    if (Vc_IS_UNLIKELY(mask.isEmpty())) {
        return;
    }
    const auto loadLane = [&](std::size_t lane) {
        if (mask[lane]) {
            v[lane] = mem[indexes[lane]];
        }
    };
    Common::unrolled_loop<std::size_t, 0, V::Size>(loadLane);
}

// Masked gather (BitScanLoop strategy): iterates over the set bits of
// mask.toInt() and loads v[i] = mem[indexes[i]] for each of them.
//
// With GNU inline asm available, every iteration handles two lanes: `bsf`
// finds the lowest set bit (i), `bsr` the highest (j), and the two `btr`
// instructions clear both bits. When only a single bit is left, i == j and
// that lane is simply loaded twice.
// Without GNU asm, one lane (the lowest set bit) is handled per iteration.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask)
{
#ifdef Vc_GNU_ASM
    size_t bits = mask.toInt();
    while (Vc_IS_LIKELY(bits > 0)) {
        size_t i, j;
        // i = lowest set bit, j = highest set bit; both are cleared in bits
        asm("bsf %[bits],%[i]\n\t"
            "bsr %[bits],%[j]\n\t"
            "btr %[i],%[bits]\n\t"
            "btr %[j],%[bits]\n\t"
            : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
        v[i] = mem[indexes[i]];
        v[j] = mem[indexes[j]];
    }
#else
    // Alternative from Vc::SSE (0.7)
    int bits = mask.toInt();
    while (bits) {
        const int i = _bit_scan_forward(bits);
	// clear the lowest set bit
	bits &= bits - 1;
	v[i] = mem[indexes[i]];
    }
#endif  // Vc_GNU_ASM
}

// Masked gather (PopcntSwitch strategy) for vectors with 16 entries.
//
// The number of set mask bits selects the entry point into a fall-through
// chain: a full mask uses the unmasked v.gather(); otherwise each case label
// loads exactly one lane and falls through to the next, so entering at
// `case N` loads N lanes. Odd cases take the lowest remaining set bit
// (_bit_scan_forward), even cases the highest (_bit_scan_reverse).
//
// Bookkeeping: an even case leaves its bit in `bits` and remembers it as a
// one-hot value in `high`; the following odd case removes both that bit and
// its own lowest bit with a single xor. `high` starts as 0 so that entering
// directly at an odd case removes only the lowest bit.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 16> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt16(bits)) {
    case 16:
        v.gather(mem, indexes);
        break;
    case 15:
        // first step of the chain: only the lowest bit needs to be removed
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        v[low] = mem[indexes[low]];
        // all cases below intentionally fall through
    case 14:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        // keep the handled bit as a one-hot value; it is cleared in the next case
        high = (1 << high);
    case 13:
        low = _bit_scan_forward(bits);
        // clears the previously handled high bit (if any) and the current low bit
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 12:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 11:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 10:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 9:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 8:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 7:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 6:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 4:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 2:
        // two bits remain: no further bookkeeping is needed, because the
        // final case picks the lowest bit, which differs from this one
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
    case 1:
        low = _bit_scan_forward(bits);
        v[low] = mem[indexes[low]];
    case 0:
        break;
    }
}
// Masked gather (PopcntSwitch strategy) for vectors with 8 entries.
//
// The number of set mask bits selects the entry point into a fall-through
// chain: a full mask uses the unmasked v.gather(); otherwise each case label
// loads exactly one lane and falls through to the next, so entering at
// `case N` loads N lanes. Odd cases take the lowest remaining set bit
// (_bit_scan_forward), even cases the highest (_bit_scan_reverse).
//
// Bookkeeping: an even case leaves its bit in `bits` and remembers it as a
// one-hot value in `high`; the following odd case removes both that bit and
// its own lowest bit with a single xor. `high` starts as 0 so that entering
// directly at an odd case removes only the lowest bit.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 8> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt8(bits)) {
    case 8:
        v.gather(mem, indexes);
        break;
    case 7:
        // first step of the chain: only the lowest bit needs to be removed
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        v[low] = mem[indexes[low]];
        // all cases below intentionally fall through
    case 6:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        // keep the handled bit as a one-hot value; it is cleared in the next case
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        // clears the previously handled high bit (if any) and the current low bit
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 4:
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        v[low] = mem[indexes[low]];
    case 2:
        // two bits remain: the final case picks the lowest bit, which differs
        // from this one, so `bits` does not need to be updated anymore
        high = _bit_scan_reverse(bits);
        v[high] = mem[indexes[high]];
    case 1:
        low = _bit_scan_forward(bits);
        v[low] = mem[indexes[low]];
    case 0:
        break;
    }
}
// Masked gather (PopcntSwitch strategy) for vectors with 4 entries.
//
// A full mask uses the unmasked v.gather(). For 1 to 3 active lanes the lanes
// are loaded one by one in the order lowest, highest, lowest remaining set
// bit, skipping the leading steps when fewer lanes are active. Nothing happens
// for an empty mask.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 4> = nullarg)
{
    unsigned int pending = mask.toInt();
    const auto active = Vc::Detail::popcnt4(pending);
    if (active == 4) {
        v.gather(mem, indexes);
        return;
    }
    if (active == 3) {
        // three lanes: take the lowest one first and drop it from the set
        const unsigned int lowest = _bit_scan_forward(pending);
        pending ^= 1 << lowest;
        v[lowest] = mem[indexes[lowest]];
    }
    if (active == 3 || active == 2) {
        // two lanes left: the highest one; its bit may stay set because the
        // last step looks for the lowest bit
        const unsigned int highest = _bit_scan_reverse(pending);
        v[highest] = mem[indexes[highest]];
    }
    if (active == 3 || active == 2 || active == 1) {
        const unsigned int lowest = _bit_scan_forward(pending);
        v[lowest] = mem[indexes[lowest]];
    }
}
// Masked gather (PopcntSwitch strategy) for vectors with 2 entries: a full
// mask uses the unmasked v.gather(), a single active lane is loaded directly,
// and an empty mask does nothing. The bit count is obtained via popcnt4.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
                                    V &v,
                                    const MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 2> = nullarg)
{
    unsigned int bits = mask.toInt();
    const auto active = Vc::Detail::popcnt4(bits);
    if (active == 2) {
        v.gather(mem, indexes);
    } else if (active == 1) {
        const unsigned int lane = _bit_scan_forward(bits);
        v[lane] = mem[indexes[lane]];
    }
}

}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_GATHERIMPLEMENTATION_H_
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
#define VC_COMMON_SCATTERIMPLEMENTATION_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{

// Masked scatter for the SetIndexZero strategy tag: stores v[i] to
// mem[indexes[i]] for every lane i whose mask entry is set.
//
// The previous body was a copy of the gather implementation: it read from mem
// and overwrote v, and never stored anything. The "zero the inactive indexes"
// trick cannot be carried over to a scatter, because inactive lanes would then
// write to mem[0]. The store is therefore done lane by lane, skipping inactive
// lanes.
//
// v       vector holding the values to store (not modified)
// mem     base pointer of the destination array
// indexes per-lane offsets into mem (taken by value, as before)
// mask    selects the lanes that are written
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
                                    V &v,
                                    MT *mem,
                                    IT indexes,
                                    typename V::MaskArgument mask)
{
    if (Vc_IS_UNLIKELY(mask.isEmpty())) {
        return;
    }
    // read the indexes through a const reference, like the other strategies do
    const IT &idx = indexes;
    Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
        if (mask[i])
            mem[idx[i]] = v[i];
    });
}

// Masked scatter (SimpleLoop strategy): visits every lane index in
// [0, V::Size) and stores mem[indexes[i]] = v[i] for the lanes whose mask
// entry is set. Returns immediately when the mask is empty.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
                                    V &v,
                                    MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask)
{
    if (Vc_IS_UNLIKELY(mask.isEmpty())) {
        return;
    }
    const auto storeLane = [&](std::size_t lane) {
        if (mask[lane]) {
            mem[indexes[lane]] = v[lane];
        }
    };
    Common::unrolled_loop<std::size_t, 0, V::Size>(storeLane);
}

// Masked scatter (BitScanLoop strategy): iterates over the set bits of
// mask.toInt() and stores mem[indexes[i]] = v[i] for each of them.
//
// With GNU inline asm available, every iteration handles two lanes: `bsf`
// finds the lowest set bit (i), `bsr` the highest (j), and the two `btr`
// instructions clear both bits. When only a single bit is left, i == j and
// that lane is simply stored twice.
// Compilers without GNU inline asm get a portable loop (mirroring the gather
// implementation) that handles the lowest set bit per iteration. Note that the
// two variants store the lanes in a different order, which is only observable
// if several active lanes share the same index.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
                                    V &v,
                                    MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask)
{
#ifdef Vc_GNU_ASM
    size_t bits = mask.toInt();
    while (Vc_IS_LIKELY(bits > 0)) {
        size_t i, j;
        // i = lowest set bit, j = highest set bit; both are cleared in bits
        asm("bsf %[bits],%[i]\n\t"
            "bsr %[bits],%[j]\n\t"
            "btr %[i],%[bits]\n\t"
            "btr %[j],%[bits]\n\t"
            : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
        mem[indexes[i]] = v[i];
        mem[indexes[j]] = v[j];
    }
#else
    // Alternative from Vc::SSE (0.7)
    int bits = mask.toInt();
    while (bits) {
        const int i = _bit_scan_forward(bits);
        // clear the lowest set bit
        bits &= bits - 1;
        mem[indexes[i]] = v[i];
    }
#endif  // Vc_GNU_ASM
}

// Masked scatter (PopcntSwitch strategy) for vectors with 16 entries.
//
// The number of set mask bits selects the entry point into a fall-through
// chain: a full mask uses the unmasked v.scatter(); otherwise each case label
// stores exactly one lane and falls through to the next, so entering at
// `case N` stores N lanes. Odd cases take the lowest remaining set bit
// (_bit_scan_forward), even cases the highest (_bit_scan_reverse).
//
// Bookkeeping: an even case leaves its bit in `bits` and remembers it as a
// one-hot value in `high`; the following odd case removes both that bit and
// its own lowest bit with a single xor. `high` starts as 0 so that entering
// directly at an odd case removes only the lowest bit.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                    V &v,
                                    MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 16> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt16(bits)) {
    case 16:
        v.scatter(mem, indexes);
        break;
    case 15:
        // first step of the chain: only the lowest bit needs to be removed
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        mem[indexes[low]] = v[low];
        // all cases below intentionally fall through
    case 14:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        // keep the handled bit as a one-hot value; it is cleared in the next case
        high = (1 << high);
    case 13:
        low = _bit_scan_forward(bits);
        // clears the previously handled high bit (if any) and the current low bit
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 12:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 11:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 10:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 9:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 8:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 7:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 6:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 4:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 2:
        // two bits remain: no further bookkeeping is needed, because the
        // final case picks the lowest bit, which differs from this one
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
    case 1:
        low = _bit_scan_forward(bits);
        mem[indexes[low]] = v[low];
    case 0:
        break;
    }
}
// Masked scatter (PopcntSwitch strategy) for vectors with 8 entries.
//
// The number of set mask bits selects the entry point into a fall-through
// chain: a full mask uses the unmasked v.scatter(); otherwise each case label
// stores exactly one lane and falls through to the next, so entering at
// `case N` stores N lanes. Odd cases take the lowest remaining set bit
// (_bit_scan_forward), even cases the highest (_bit_scan_reverse).
//
// Bookkeeping: an even case leaves its bit in `bits` and remembers it as a
// one-hot value in `high`; the following odd case removes both that bit and
// its own lowest bit with a single xor. `high` starts as 0 so that entering
// directly at an odd case removes only the lowest bit.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                    V &v,
                                    MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 8> = nullarg)
{
    unsigned int bits = mask.toInt();
    unsigned int low, high = 0;
    switch (Vc::Detail::popcnt8(bits)) {
    case 8:
        v.scatter(mem, indexes);
        break;
    case 7:
        // first step of the chain: only the lowest bit needs to be removed
        low = _bit_scan_forward(bits);
        bits ^= 1 << low;
        mem[indexes[low]] = v[low];
        // all cases below intentionally fall through
    case 6:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        // keep the handled bit as a one-hot value; it is cleared in the next case
        high = (1 << high);
    case 5:
        low = _bit_scan_forward(bits);
        // clears the previously handled high bit (if any) and the current low bit
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 4:
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
        high = (1 << high);
    case 3:
        low = _bit_scan_forward(bits);
        bits ^= high | (1 << low);
        mem[indexes[low]] = v[low];
    case 2:
        // two bits remain: the final case picks the lowest bit, which differs
        // from this one, so `bits` does not need to be updated anymore
        high = _bit_scan_reverse(bits);
        mem[indexes[high]] = v[high];
    case 1:
        low = _bit_scan_forward(bits);
        mem[indexes[low]] = v[low];
    case 0:
        break;
    }
}
// Masked scatter (PopcntSwitch strategy) for vectors with 4 entries.
//
// A full mask uses the unmasked v.scatter(). For 1 to 3 active lanes the lanes
// are stored one by one in the order lowest, highest, lowest remaining set
// bit, skipping the leading steps when fewer lanes are active. Nothing happens
// for an empty mask.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                    V &v,
                                    MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 4> = nullarg)
{
    unsigned int pending = mask.toInt();
    const auto active = Vc::Detail::popcnt4(pending);
    if (active == 4) {
        v.scatter(mem, indexes);
        return;
    }
    if (active == 3) {
        // three lanes: take the lowest one first and drop it from the set
        const unsigned int lowest = _bit_scan_forward(pending);
        pending ^= 1 << lowest;
        mem[indexes[lowest]] = v[lowest];
    }
    if (active == 3 || active == 2) {
        // two lanes left: the highest one; its bit may stay set because the
        // last step looks for the lowest bit
        const unsigned int highest = _bit_scan_reverse(pending);
        mem[indexes[highest]] = v[highest];
    }
    if (active == 3 || active == 2 || active == 1) {
        const unsigned int lowest = _bit_scan_forward(pending);
        mem[indexes[lowest]] = v[lowest];
    }
}
// Masked scatter (PopcntSwitch strategy) for vectors with 2 entries: a full
// mask uses the unmasked v.scatter(), a single active lane is stored directly,
// and an empty mask does nothing. The bit count is obtained via popcnt4.
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
                                    V &v,
                                    MT *mem,
                                    const IT &indexes,
                                    typename V::MaskArgument mask,
                                    enable_if<V::Size == 2> = nullarg)
{
    unsigned int bits = mask.toInt();
    const auto active = Vc::Detail::popcnt4(bits);
    if (active == 2) {
        v.scatter(mem, indexes);
    } else if (active == 1) {
        const unsigned int lane = _bit_scan_forward(bits);
        mem[indexes[lane]] = v[lane];
    }
}

}  // namespace Common
}  // namespace Vc

#endif // VC_COMMON_SCATTERIMPLEMENTATION_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
// Element-wise equality. Each overload forwards to the cmpeq intrinsic for its
// element width; signed and unsigned integers share the same intrinsic.
Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b)
{
    return _mm_cmpeq_pd(a.data(), b.data());
}
Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b)
{
    return _mm_cmpeq_ps(a.data(), b.data());
}
Vc_INTRINSIC SSE::   int_m operator==(SSE::   int_v a, SSE::   int_v b)
{
    return _mm_cmpeq_epi32(a.data(), b.data());
}
Vc_INTRINSIC SSE::  uint_m operator==(SSE::  uint_v a, SSE::  uint_v b)
{
    return _mm_cmpeq_epi32(a.data(), b.data());
}
Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b)
{
    return _mm_cmpeq_epi16(a.data(), b.data());
}
Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b)
{
    return _mm_cmpeq_epi16(a.data(), b.data());
}

// Element-wise inequality. Floating-point types use the cmpneq intrinsics;
// integer types negate the result of the cmpeq intrinsic via not_().
Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b)
{
    return _mm_cmpneq_pd(a.data(), b.data());
}
Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b)
{
    return _mm_cmpneq_ps(a.data(), b.data());
}
Vc_INTRINSIC SSE::   int_m operator!=(SSE::   int_v a, SSE::   int_v b)
{
    return not_(_mm_cmpeq_epi32(a.data(), b.data()));
}
Vc_INTRINSIC SSE::  uint_m operator!=(SSE::  uint_v a, SSE::  uint_v b)
{
    return not_(_mm_cmpeq_epi32(a.data(), b.data()));
}
Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b)
{
    return not_(_mm_cmpeq_epi16(a.data(), b.data()));
}
Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b)
{
    return not_(_mm_cmpeq_epi16(a.data(), b.data()));
}

// Element-wise greater-than. Unsigned vectors use the dedicated
// SSE::cmpgt_epu* helpers unless USE_INCORRECT_UNSIGNED_COMPARE is defined, in
// which case the signed intrinsic is applied to the unsigned data.
Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b)
{
    return _mm_cmpgt_pd(a.data(), b.data());
}
Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b)
{
    return _mm_cmpgt_ps(a.data(), b.data());
}
Vc_INTRINSIC SSE::   int_m operator> (SSE::   int_v a, SSE::   int_v b)
{
    return _mm_cmpgt_epi32(a.data(), b.data());
}
Vc_INTRINSIC SSE::  uint_m operator> (SSE::  uint_v a, SSE::  uint_v b)
{
#ifdef USE_INCORRECT_UNSIGNED_COMPARE
    return _mm_cmpgt_epi32(a.data(), b.data());
#else
    return SSE::cmpgt_epu32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b)
{
    return _mm_cmpgt_epi16(a.data(), b.data());
}
Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b)
{
#ifdef USE_INCORRECT_UNSIGNED_COMPARE
    return _mm_cmpgt_epi16(a.data(), b.data());
#else
    return SSE::cmpgt_epu16(a.data(), b.data());
#endif
}

// Element-wise less-than. Unsigned vectors use the dedicated
// SSE::cmplt_epu* helpers unless USE_INCORRECT_UNSIGNED_COMPARE is defined, in
// which case the signed intrinsic is applied to the unsigned data.
Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b)
{
    return _mm_cmplt_pd(a.data(), b.data());
}
Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b)
{
    return _mm_cmplt_ps(a.data(), b.data());
}
Vc_INTRINSIC SSE::   int_m operator< (SSE::   int_v a, SSE::   int_v b)
{
    return _mm_cmplt_epi32(a.data(), b.data());
}
Vc_INTRINSIC SSE::  uint_m operator< (SSE::  uint_v a, SSE::  uint_v b)
{
#ifdef USE_INCORRECT_UNSIGNED_COMPARE
    return _mm_cmplt_epi32(a.data(), b.data());
#else
    return SSE::cmplt_epu32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b)
{
    return _mm_cmplt_epi16(a.data(), b.data());
}
Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b)
{
#ifdef USE_INCORRECT_UNSIGNED_COMPARE
    return _mm_cmplt_epi16(a.data(), b.data());
#else
    return SSE::cmplt_epu16(a.data(), b.data());
#endif
}

// Element-wise a >= b for floating-point vectors.
// The ordered "greater-or-equal" predicate is used so that a lane with NaN in either
// operand compares false, matching scalar `>=` and the ordered predicates used by the
// sibling operator>, operator< and operator<= overloads. (The "not-less-than"
// predicate would report true for such unordered lanes.)
Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpge_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpge_ps(a.data(), b.data()); }
// Element-wise a >= b for the integer vector types, expressed as the negation of the
// a < b mask.
Vc_INTRINSIC SSE::int_m operator>=(SSE::int_v a, SSE::int_v b)
{
    const auto lessThan = a < b;
    return !lessThan;
}
Vc_INTRINSIC SSE::uint_m operator>=(SSE::uint_v a, SSE::uint_v b)
{
    const auto lessThan = a < b;
    return !lessThan;
}
Vc_INTRINSIC SSE::short_m operator>=(SSE::short_v a, SSE::short_v b)
{
    const auto lessThan = a < b;
    return !lessThan;
}
Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b)
{
    const auto lessThan = a < b;
    return !lessThan;
}

// Element-wise a <= b. The floating-point overloads use the compare intrinsic
// directly; the integer overloads negate the a > b mask.
Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b)
{
    const auto lhs = a.data();
    const auto rhs = b.data();
    return _mm_cmple_pd(lhs, rhs);
}
Vc_INTRINSIC SSE::float_m operator<=(SSE::float_v a, SSE::float_v b)
{
    const auto lhs = a.data();
    const auto rhs = b.data();
    return _mm_cmple_ps(lhs, rhs);
}
Vc_INTRINSIC SSE::int_m operator<=(SSE::int_v a, SSE::int_v b)
{
    const auto greaterThan = a > b;
    return !greaterThan;
}
Vc_INTRINSIC SSE::uint_m operator<=(SSE::uint_v a, SSE::uint_v b)
{
    const auto greaterThan = a > b;
    return !greaterThan;
}
Vc_INTRINSIC SSE::short_m operator<=(SSE::short_v a, SSE::short_v b)
{
    const auto greaterThan = a > b;
    return !greaterThan;
}
Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b)
{
    const auto greaterThan = a > b;
    return !greaterThan;
}

// bitwise operators {{{1
// Bitwise XOR of the raw register contents of a and b.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto bits = xor_(a.data(), b.data());
    return bits;
}
// Bitwise AND of the raw register contents of a and b.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto bits = and_(a.data(), b.data());
    return bits;
}
// Bitwise OR of the raw register contents of a and b.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto bits = or_(a.data(), b.data());
    return bits;
}
// arithmetic operators {{{1
// Element-wise sum. The value-initialized T passed as third argument only selects the
// add() overload for the element type.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto sum = add(a.data(), b.data(), T());
    return sum;
}
// Element-wise difference; T() selects the sub() overload for the element type.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto difference = sub(a.data(), b.data(), T());
    return difference;
}
// Element-wise product; T() selects the mul() overload for the element type.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto product = mul(a.data(), b.data(), T());
    return product;
}
// Element-wise division, enabled only for floating-point element types. T() selects the
// div() overload for the element type.
template <typename T>
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto quotient = div(a.data(), b.data(), T());
    return quotient;
}
// Element-wise division for int and uint vectors. The quotient is computed one entry at
// a time with scalar division and reassembled through Vector::generate.
template <typename T>
Vc_INTRINSIC
    enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
    operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    const auto divideEntry = [&](int i) { return a[i] / b[i]; };
    return SSE::Vector<T>::generate(divideEntry);
}
// Element-wise division for short and ushort vectors.
// Both operands are split into two halves via VectorHelper::expand0/expand1
// (presumably widening the lower and upper four 16-bit entries to 32 bit), converted
// to float, divided, truncated back to 32-bit integers and recombined with
// VectorHelper::concat.
template <typename T>
Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
                       SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    using Helper = SSE::VectorHelper<T>;
    const __m128 numeratorLo = _mm_cvtepi32_ps(Helper::expand0(a.data()));
    const __m128 numeratorHi = _mm_cvtepi32_ps(Helper::expand1(a.data()));
    const __m128 denominatorLo = _mm_cvtepi32_ps(Helper::expand0(b.data()));
    const __m128 denominatorHi = _mm_cvtepi32_ps(Helper::expand1(b.data()));
    const __m128 quotientLo = _mm_div_ps(numeratorLo, denominatorLo);
    const __m128 quotientHi = _mm_div_ps(numeratorHi, denominatorHi);
    // cvttps truncates toward zero, like scalar integer division.
    return Helper::concat(_mm_cvttps_epi32(quotientLo), _mm_cvttps_epi32(quotientHi));
}
// Element-wise remainder for integral element types, derived from the quotient:
// a % b == a - (a / b) * b.
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    const SSE::Vector<T> quotient = a / b;
    const SSE::Vector<T> wholePart = quotient * b;
    return a - wholePart;
}
// }}}1
}  // namespace Detail
// constants {{{1
// Special-initializer constructor: initializes the data member from HV::zero().
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
    : d(HV::zero())
{
}

// Special-initializer constructor: initializes the data member from HT::one().
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
    : d(HT::one())
{
}

// Special-initializer constructor: loads the constant returned by
// Detail::IndexesFromZero<EntryType, Size>() with Detail::load16 and the Aligned flag
// (presumably the sequence 0, 1, 2, ... -- the table itself is defined elsewhere).
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
{
#if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
    // GCC 4.9.2 (at least) miscompiles SSE::short_v::IndexesFromZero() if used implicitly
    // from SimdArray<short, 9> compiling for AVX2 to vpmovsxwd (sign extending load from
    // a 8x 16-bit constant to 8x 32-bit register)
    // The empty asm statement takes the loaded value as an "x" (SSE register) input
    // operand as a workaround for that miscompilation.
    if (std::is_same<T, short>::value) {
        asm("" ::"x"(d.v()));
    }
#endif
}

// float specialization: takes SSE::int_v::IndexesFromZero() and converts it to float
// with SSE::convert<int, float>.
template <>
Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
{
}

// double specialization: takes SSE::int_v::IndexesFromZero() and converts it to double
// with SSE::convert<int, double>.
template <>
Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
{
}

// load member functions {{{1
// Loads the vector from memory at `mem`, with source element type SrcT, according to
// `flags`. Prefetches requested through the flags are issued first via
// Common::handleLoadPrefetches; the load and any conversion to DstT is delegated to
// Detail::load.
// The return type is the nested load_concept<SrcT, Flags>::type; the `template`
// disambiguator in front of it is omitted when compiling with MSVC.
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}

// zeroing {{{1
// Overwrites the whole vector with HV::zero().
template <typename T>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
{
    data() = HV::zero();
}

// Clears the entries selected by mask k: the mask bits are reinterpreted as
// VectorType and combined with the data through Detail::andnot_.
template <typename T>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
{
    const auto maskBits = SSE::sse_cast<VectorType>(k.data());
    data() = Detail::andnot_(maskBits, data());
}

// Counterpart of setZero(k): the mask bits are AND-ed with the data, so only the
// entries selected by k keep their value.
template <typename T>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
{
    const auto maskBits = SSE::sse_cast<VectorType>(k.data());
    data() = Detail::and_(maskBits, data());
}

// Sets every entry to the all-bits-set pattern returned by SSE::_mm_setallone_pd(),
// which is a NaN encoding.
template <>
Vc_INTRINSIC void SSE::double_v::setQnan()
{
    const auto allBitsSet = SSE::_mm_setallone_pd();
    data() = allBitsSet;
}
// Masked variant: ORs the mask bits (k.dataD()) into the data, so entries selected by k
// become all-ones while the others are left untouched.
template <>
Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
{
    const auto selected = k.dataD();
    data() = _mm_or_pd(data(), selected);
}
// Sets every entry to the all-bits-set pattern returned by SSE::_mm_setallone_ps(),
// which is a NaN encoding.
template <>
Vc_INTRINSIC void SSE::float_v::setQnan()
{
    const auto allBitsSet = SSE::_mm_setallone_ps();
    data() = allBitsSet;
}
// Masked variant: ORs the mask bits (k.data()) into the data, so entries selected by k
// become all-ones while the others are left untouched.
template <>
Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
{
    const auto selected = k.data();
    data() = _mm_or_ps(data(), selected);
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
// Stores the vector to `mem` according to `flags`. Prefetches requested through the
// flags are issued first; the store itself is delegated to HV::store<Flags>.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    const auto &value = data();
    HV::template store<Flags>(mem, value);
}

// Masked store: as above, but the mask (reinterpreted as VectorType) is forwarded to
// HV::store<Flags> together with the data.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    const auto &value = data();
    const auto writeMask = sse_cast<VectorType>(mask.data());
    HV::template store<Flags>(mem, value, writeMask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
// Unary minus. Delegates to Detail::negate, which is selected by a tag carrying the
// element size in bytes.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
{
    using ElementSizeTag = std::integral_constant<std::size_t, sizeof(T)>;
    return Detail::negate(d.v(), ElementSizeTag());
}
///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
// Shifts with a per-entry shift count (vector argument). These specializations only
// exist when XOP or AVX2 is available.
#ifdef Vc_IMPL_XOP
// XOP: signed types use the arithmetic shift (_mm_sha_*), unsigned types the logical
// shift (_mm_shl_*). Right shifts are expressed as a left shift by the negated count.
template <> Vc_ALWAYS_INLINE    SSE::int_v    SSE::int_v::operator<<(const    SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE   SSE::uint_v   SSE::uint_v::operator<<(const   SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE  SSE::short_v  SSE::short_v::operator<<(const  SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE    SSE::int_v    SSE::int_v::operator>>(const    SSE::int_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE   SSE::uint_v   SSE::uint_v::operator>>(const   SSE::uint_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE  SSE::short_v  SSE::short_v::operator>>(const  SSE::short_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
#elif defined Vc_IMPL_AVX2
// AVX2: only the 32-bit element types are specialized here, using the variable-count
// shift intrinsics (arithmetic right shift for int, logical right shift for uint).
template <> Vc_ALWAYS_INLINE SSE::Vector<   int> Vector<   int, VectorAbi::Sse>::operator<<(const SSE::Vector<   int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<  uint> Vector<  uint, VectorAbi::Sse>::operator<<(const SSE::Vector<  uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<   int> Vector<   int, VectorAbi::Sse>::operator>>(const SSE::Vector<   int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<  uint> Vector<  uint, VectorAbi::Sse>::operator>>(const SSE::Vector<  uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
#endif

// Shifts every entry right by the same count, in place; delegated to HT::shiftRight.
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift)
{
    const auto shiftedBits = HT::shiftRight(d.v(), shift);
    d.v() = shiftedBits;
    return *this;
}
// Returns a copy with every entry shifted right by the same count.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const
{
    const auto shiftedBits = HT::shiftRight(d.v(), shift);
    return shiftedBits;
}
// Shifts every entry left by the same count, in place; delegated to HT::shiftLeft.
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift)
{
    const auto shiftedBits = HT::shiftLeft(d.v(), shift);
    d.v() = shiftedBits;
    return *this;
}
// Returns a copy with every entry shifted left by the same count.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const
{
    const auto shiftedBits = HT::shiftLeft(d.v(), shift);
    return shiftedBits;
}

///////////////////////////////////////////////////////////////////////////////////////////
// isnegative {{{1
// Returns a mask that is set for every entry whose sign bit is set.
// The sign bit is isolated with the sign mask and then smeared over the whole 32-bit
// lane by an arithmetic right shift of 31.
Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
{
    const auto signOnly = _mm_and_ps(SSE::_mm_setsignmask_ps(), x.data());
    const auto smeared = _mm_srai_epi32(sse_cast<__m128i>(signOnly), 31);
    return sse_cast<__m128>(smeared);
}
// double variant: the sign bit lives in the upper 32-bit half of each 64-bit entry, so
// after the 32-bit arithmetic shift the odd 32-bit lanes (1 and 3) are duplicated into
// their even neighbours to fill the full 64-bit mask entries.
Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
{
    const auto signOnly = _mm_and_pd(SSE::_mm_setsignmask_pd(), x.data());
    const auto smeared = _mm_srai_epi32(sse_cast<__m128i>(signOnly), 31);
    return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(smeared));
}

// gathers {{{1
// Unmasked gathers: each entry i of the vector is read from mem[indexes[i]] and the
// entries are assembled with the "set in memory order" helper for the element type.

// double_v: two entries.
template <>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void SSE::double_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const MT &e0 = mem[indexes[0]];
    const MT &e1 = mem[indexes[1]];
    d.v() = _mm_setr_pd(e0, e1);
}

// float_v: four entries.
template <>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void SSE::float_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const MT &e0 = mem[indexes[0]];
    const MT &e1 = mem[indexes[1]];
    const MT &e2 = mem[indexes[2]];
    const MT &e3 = mem[indexes[3]];
    d.v() = _mm_setr_ps(e0, e1, e2, e3);
}

// int_v: four entries.
template <>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void SSE::int_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const MT &e0 = mem[indexes[0]];
    const MT &e1 = mem[indexes[1]];
    const MT &e2 = mem[indexes[2]];
    const MT &e3 = mem[indexes[3]];
    d.v() = _mm_setr_epi32(e0, e1, e2, e3);
}

// uint_v: four entries.
template <>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void SSE::uint_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const MT &e0 = mem[indexes[0]];
    const MT &e1 = mem[indexes[1]];
    const MT &e2 = mem[indexes[2]];
    const MT &e3 = mem[indexes[3]];
    d.v() = _mm_setr_epi32(e0, e1, e2, e3);
}

// short_v: eight entries, assembled through Vc::set.
template <>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void SSE::short_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const MT &e0 = mem[indexes[0]];
    const MT &e1 = mem[indexes[1]];
    const MT &e2 = mem[indexes[2]];
    const MT &e3 = mem[indexes[3]];
    const MT &e4 = mem[indexes[4]];
    const MT &e5 = mem[indexes[5]];
    const MT &e6 = mem[indexes[6]];
    const MT &e7 = mem[indexes[7]];
    d.v() = Vc::set(e0, e1, e2, e3, e4, e5, e6, e7);
}

// ushort_v: eight entries, assembled through Vc::set.
template <>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void SSE::ushort_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const MT &e0 = mem[indexes[0]];
    const MT &e1 = mem[indexes[1]];
    const MT &e2 = mem[indexes[2]];
    const MT &e3 = mem[indexes[3]];
    const MT &e4 = mem[indexes[4]];
    const MT &e5 = mem[indexes[5]];
    const MT &e6 = mem[indexes[6]];
    const MT &e7 = mem[indexes[7]];
    d.v() = Vc::set(e0, e1, e2, e3, e4, e5, e6, e7);
}

// Masked gather: forwards to Common::executeGather with a compile-time selected
// strategy.
// Selector is an integral_constant over Common::GatherScatterImplementation whose value
// is assembled by the preprocessor:
//   - with Vc_USE_SET_GATHERS, SetIndexZero is chosen when IT is a SIMD vector type,
//     otherwise the fallback below applies;
//   - the fallback is BitScanLoop (Vc_USE_BSF_GATHERS), PopcntSwitch
//     (Vc_USE_POPCNT_BSF_GATHERS) or SimpleLoop (neither macro defined).
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::gatherImplementation(const MT *mem,
                                                            const IT &indexes,
                                                            MaskArgument mask)
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}

// scatters {{{1
// Unmasked scatter: writes entry i of the vector to mem[indexes[i]] for every
// i in [0, Size), using a compile-time unrolled loop.
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
{
    const auto writeEntry = [&](std::size_t i) { mem[indexes[i]] = d.m(i); };
    Common::unrolled_loop<std::size_t, 0, Size>(writeEntry);
}

// Masked scatter: forwards to Common::executeScatter with a compile-time selected
// strategy.
// Selector is an integral_constant over Common::GatherScatterImplementation whose value
// is assembled by the preprocessor:
//   - with Vc_USE_SET_GATHERS, SetIndexZero is chosen when IT is a SIMD vector type,
//     otherwise the fallback below applies;
//   - the fallback is BitScanLoop (Vc_USE_BSF_GATHERS), PopcntSwitch
//     (Vc_USE_POPCNT_BSF_GATHERS) or SimpleLoop (neither macro defined).
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeScatter(Selector(), *this, mem, indexes, mask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
// Returns the inclusive prefix sum of the vector: entry i of the result is the sum of
// entries 0..i of *this.
// It is computed in log2(Size) steps; each step adds a copy of the running result that
// was shifted up by 1, 2, 4, ... entries (shifted() with a negative amount). The
// Size comparisons are compile-time constants, so steps that do not apply to the
// current element count are never taken.
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
{
    //   a    b    c    d    e    f    g    h
    // +      a    b    c    d    e    f    g    -> a ab bc  cd   de    ef     fg      gh
    // +           a    ab   bc   cd   de   ef   -> a ab abc abcd bcde  cdef   defg    efgh
    // +                     a    ab   abc  abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    Vector<T, VectorAbi::Sse> tmp = *this;
    if (Size >  1) tmp += tmp.shifted(-1);
    if (Size >  2) tmp += tmp.shifted(-2);
    if (Size >  4) tmp += tmp.shifted(-4);
    if (Size >  8) tmp += tmp.shifted(-8);
    if (Size > 16) tmp += tmp.shifted(-16);
    return tmp;
}
#ifndef Vc_IMPL_SSE4_1
// Horizontal product of all four entries for int_v / uint_v.
// Without SSE4.1 a vectorized integer multiplication is slow, so the four entries are
// multiplied as scalars instead (pairwise: (e0 * e1) * (e2 * e3)).
template <>
Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
{
    const int lowerPair = d.m(0) * d.m(1);
    const int upperPair = d.m(2) * d.m(3);
    return lowerPair * upperPair;
}
template <>
Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
{
    const unsigned int lowerPair = d.m(0) * d.m(1);
    const unsigned int upperPair = d.m(2) * d.m(3);
    return lowerPair * upperPair;
}
#endif
// Masked horizontal minimum: entries not selected by m are replaced by
// numeric_limits<Vector>::max() so that they cannot win the reduction.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType
Vector<T, VectorAbi::Sse>::min(MaskArg m) const
{
    using V = Vector<T, VectorAbi::Sse>;
    V candidates = std::numeric_limits<V>::max();
    candidates(m) = *this;
    return candidates.min();
}
// Masked horizontal maximum: returns the largest entry among those selected by m.
//
// Entries not selected by m are replaced by the lowest representable value of the
// element type so that they cannot win the reduction. numeric_limits<>::lowest() is
// used rather than min(): for floating-point types min() is the smallest *positive*
// normalized value, which would be returned incorrectly whenever all selected entries
// are negative. For integer types lowest() and min() are identical.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp(std::numeric_limits<EntryType>::lowest());
    tmp(m) = *this;
    return tmp.max();
}
// Masked horizontal product: entries not selected by m are replaced by one (the
// neutral element of multiplication) before reducing.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType
Vector<T, VectorAbi::Sse>::product(MaskArg m) const
{
    using V = Vector<T, VectorAbi::Sse>;
    V factors(Vc::One);
    factors(m) = *this;
    return factors.product();
}
// Masked horizontal sum: entries not selected by m are replaced by zero (the neutral
// element of addition) before reducing.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType
Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
{
    using V = Vector<T, VectorAbi::Sse>;
    V summands(Vc::Zero);
    summands(m) = *this;
    return summands.sum();
}

///////////////////////////////////////////////////////////////////////////////////////////
// exponent {{{1
namespace Detail
{
// Extracts the unbiased binary exponent of each float and returns it as a float.
// The bits are shifted right by 23 (the float mantissa width) and the bias 0x7f is
// subtracted. The sign bit is not masked off, so the result is only meaningful for
// inputs whose sign bit is clear.
Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
{
    const __m128i rawBits = _mm_castps_si128(v);
    const __m128i biased = _mm_srli_epi32(rawBits, 23);
    const __m128i unbiased = _mm_sub_epi32(biased, _mm_set1_epi32(0x7f));
    return _mm_cvtepi32_ps(unbiased);
}
// Extracts the unbiased binary exponent of each double and returns it as a double.
// The bits are shifted right by 52 (the double mantissa width) and the bias 0x3ff is
// subtracted. Shuffle 0x08 then moves 32-bit lanes 0 and 2 (the low halves of the two
// 64-bit entries) into the two lowest lanes, which is what _mm_cvtepi32_pd converts.
// As above, the sign bit is not masked off.
Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
{
    const __m128i rawBits = _mm_castpd_si128(v);
    const __m128i biased = _mm_srli_epi64(rawBits, 52);
    const __m128i unbiased = _mm_sub_epi32(biased, _mm_set1_epi32(0x3ff));
    const __m128i packedLow = _mm_shuffle_epi32(unbiased, 0x08);
    return _mm_cvtepi32_pd(packedLow);
}
} // namespace Detail

// Returns the binary exponent of every entry of x as a vector of the same type.
// Precondition (checked with Vc_ASSERT): all entries satisfy x >= 0. The
// implementation in Detail::exponent does not strip the sign bit.
Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
{
    // The comparison operators live in namespace Detail; make >= visible here.
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// double overload; same contract as the float overload above.
Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
{
    // The comparison operators live in namespace Detail; make >= visible here.
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// }}}1
// Random {{{1
// Advances the shared random number state by one step.
// On return state0 and state1 hold the state as it was *before* the step (loaded from
// Common::RandomState[0..] and Common::RandomState[uint_v::Size..]). Both halves are
// advanced with the linear congruential step x * 0xdeece66d + 11; the first half is
// additionally XOR-ed with the old second half shifted right by 16 bits before it is
// written back.
static void _doRandomStep(SSE::uint_v &state0,
        SSE::uint_v &state1)
{
    using SSE::uint_v;
    using Detail::operator+;
    using Detail::operator*;
    const uint_v multiplier(0xdeece66du);
    const uint_v increment(11);

    state0.load(&Common::RandomState[0]);
    state1.load(&Common::RandomState[uint_v::Size]);

    const uint_v nextState1 = state1 * multiplier + increment;
    nextState1.store(&Common::RandomState[uint_v::Size]);

    const uint_v steppedState0 = state0 * multiplier + increment;
    const __m128i mixIn = _mm_srli_epi32(state1.data(), 16);
    const uint_v nextState0(_mm_xor_si128(steppedState0.data(), mixIn));
    nextState0.store(&Common::RandomState[0]);
}

// Generic Random(): advances the shared generator state and returns the raw bits of the
// first (pre-step) state vector.
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
{
    SSE::uint_v state0;
    SSE::uint_v state1;
    _doRandomStep(state0, state1);
    const auto randomBits = state0.data();
    return randomBits;
}

// float_v::Random(): the random bits are shifted right by 2 and OR-ed with HT::one(),
// then HT::one() is subtracted again (presumably mapping the bits into [1, 2) and the
// result into [0, 1) -- HT::one() is assumed to be 1.0f).
template <>
Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
{
    SSE::uint_v state0;
    SSE::uint_v state1;
    _doRandomStep(state0, state1);
    const auto mantissaBits = _mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2));
    const auto withOne = _mm_or_ps(mantissaBits, HT::one());
    return _mm_sub_ps(withOne, HT::one());
}

// double_v::Random(): uses a separate pair of 64-bit states stored in
// Common::RandomState[8..11].
// The pre-step state is used for the result; each 64-bit state is then advanced with
// x * 0x5deece66d + 11 and written back.
template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
{
    // May-alias 64-bit type so the state array can be read and written as two
    // 64-bit values.
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
    uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
    // Aligned 128-bit load of the same (pre-step) state for the vector result.
    const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
    *reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
    *reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
    // Shift right by 12 so only the 52 mantissa bits remain, OR in HT::one() and
    // subtract it again (presumably yielding values in [0, 1) -- HT::one() is assumed
    // to be 1.0).
    return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
}
// shifted / rotated {{{1
// Returns a copy of the vector with its entries shifted by `amount` positions; vacated
// positions are filled with zero.
// Positive amounts use the byte-wise right shift of the 128-bit register
// (_mm_srli_si128), negative amounts the byte-wise left shift (_mm_slli_si128); the
// byte count is amount * sizeof(EntryType). The switch is needed because the
// intrinsics require a compile-time constant shift count. Any amount outside [-8, 8]
// returns Zero().
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case  0: return *this;
    case  1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case  2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case  3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case  4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case  5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case  6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case  7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case  8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    }
    // Shift distance not covered above: nothing of the original vector remains.
    return Zero();
}
// Like shifted(amount), but the vacated positions are filled with entries from
// `shiftIn` instead of zeros.
// For amount >= -size() the result is built with SSE::alignr_epi8 from the two
// registers (the byte offset must be a compile-time constant, hence the switch).
// Any amount not handled by the switch -- including amount < -size() -- falls through
// to the final return, where only entries of shiftIn can remain and the single-vector
// shifted() on shiftIn is used.
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
{
    if (amount >= -int(size())) {
        constexpr int VectorWidth = int(size());
        constexpr int EntryTypeSizeof = sizeof(EntryType);
        const __m128i v0 = sse_cast<__m128i>(d.v());
        const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
        // Converts the integer register produced by alignr back to VectorType.
        auto &&fixup = sse_cast<VectorType, __m128i>;
        switch (amount) {
        case  0: return *this;
                 // alignr_epi8: [arg1 arg0] << n
        case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1));
        case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1));
        case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1));
        case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1));
        case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1));
        case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1));
        case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1));
        case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1));
        case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1));
        case-10: return fixup(SSE::alignr_epi8<(VectorWidth -10) * EntryTypeSizeof>(v0, v1));
        case-11: return fixup(SSE::alignr_epi8<(VectorWidth -11) * EntryTypeSizeof>(v0, v1));
        case-12: return fixup(SSE::alignr_epi8<(VectorWidth -12) * EntryTypeSizeof>(v0, v1));
        case-13: return fixup(SSE::alignr_epi8<(VectorWidth -13) * EntryTypeSizeof>(v0, v1));
        case-14: return fixup(SSE::alignr_epi8<(VectorWidth -14) * EntryTypeSizeof>(v0, v1));
        case-15: return fixup(SSE::alignr_epi8<(VectorWidth -15) * EntryTypeSizeof>(v0, v1));
        case  1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
        case  2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
        case  3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
        case  4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
        case  5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
        case  6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
        case  7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
        case  8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
        case  9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
        case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
        case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
        case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
        case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
        case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
        case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
        }
    }
    // Nothing of *this remains; shift shiftIn alone (zeros fill the rest).
    return shiftIn.shifted(int(size()) + amount);
}
// Returns a copy of the vector with its entries rotated by `amount` positions.
// The amount is converted to unsigned and reduced modulo Size, so only rotations in
// [0, Size) are ever executed. The rotation is implemented by feeding the register to
// SSE::alignr_epi8 as both operands with a byte offset of n * sizeof(EntryType).
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v = SSE::sse_cast<__m128i>(d.v());
    switch (static_cast<unsigned int>(amount) % Size) {
    case  0: return *this;
    case  1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
    case  2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
    case  3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
             // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
             // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
             // disables the following four calls unless sizeof(EntryType) == 2.
    case  4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
    case  5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
    case  6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
    case  7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
    }
    // Only reached if the reduced amount is not one of the cases above.
    return Zero();
}
// sorted {{{1
namespace Detail
{
// Sorts the two entries of a double_v in ascending order.
// A copy with both entries swapped is created; the scalar min/max of the low entries
// then yields the smaller and the larger value, which are placed in entry 0 and
// entry 1 of the result.
inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
{
    const __m128d original = x_.data();
    const __m128d swapped = _mm_shuffle_pd(original, original, _MM_SHUFFLE2(0, 1));
    const __m128d smaller = _mm_min_sd(original, swapped);
    const __m128d larger = _mm_max_sd(original, swapped);
    return _mm_unpacklo_pd(smaller, larger);
}
}  // namespace Detail
// Returns a sorted copy of the vector. The work is delegated to the Detail::sorted
// overload matching this vector type.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
    const
{
    return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
// interleaveLow(x) / interleaveHigh(x): interleave the entries of the lower
// (respectively upper) half of *this with those of x, starting with the entry of
// *this. Each specialization maps onto the unpacklo/unpackhi intrinsic of the matching
// element width.
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::float_v  SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::float_v  SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
template <> Vc_INTRINSIC    SSE::int_v    SSE::int_v::interleaveLow (   SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC    SSE::int_v    SSE::int_v::interleaveHigh(   SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC   SSE::uint_v   SSE::uint_v::interleaveLow (  SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC   SSE::uint_v   SSE::uint_v::interleaveHigh(  SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::short_v  SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::short_v  SSE::short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
// }}}1
// generate {{{1
// generate(gen): builds a vector whose entry i is gen(i).
// The generator is called once per entry, in ascending index order. The results are
// stored in named locals first because the evaluation order of function arguments is
// unspecified; calling gen() directly inside the setr call would not guarantee that
// order.

// double_v: two entries.
template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    return _mm_setr_pd(tmp0, tmp1);
}
// float_v: four entries.
template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
}
// int_v: four entries.
template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
// uint_v: four entries.
template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
// short_v: eight entries.
template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
// ushort_v: eight entries.
template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
// }}}1
// reversed {{{1
// reversed(): returns a copy with the entries in reverse order. For the two- and
// four-entry types this is a single Mem::permute whose template arguments list the
// source positions in reverse.
template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
{
    return Mem::permute<X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
// reversed() for the eight-entry 16-bit types.
// Two permutes are applied to the register: permuteHi<X7, X6, X5, X4> and
// permuteLo<X3, X2, X1, X0> (presumably reversing the upper and the lower four
// entries, respectively). Mem::shuffle<X1, Y0> then combines the upper 64 bits of the
// first with the lower 64 bits of the second, which swaps the two halves.
template <>
Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
{
    const auto upperReversed = Mem::permuteHi<X7, X6, X5, X4>(d.v());
    const auto lowerReversed = Mem::permuteLo<X3, X2, X1, X0>(d.v());
    const auto halvesSwapped = Mem::shuffle<X1, Y0>(sse_cast<__m128d>(upperReversed),
                                                    sse_cast<__m128d>(lowerReversed));
    return sse_cast<__m128i>(halvesSwapped);
}
template <>
Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
{
    const auto upperReversed = Mem::permuteHi<X7, X6, X5, X4>(d.v());
    const auto lowerReversed = Mem::permuteLo<X3, X2, X1, X0>(d.v());
    const auto halvesSwapped = Mem::shuffle<X1, Y0>(sse_cast<__m128d>(upperReversed),
                                                    sse_cast<__m128d>(lowerReversed));
    return sse_cast<__m128i>(halvesSwapped);
}
// }}}1
// permutation via operator[] {{{1
// Permutation via operator[]: with AVX available, returns the entries of this vector
// rearranged according to `perm` using _mm_permutevar_ps.
// NOTE(review): without Vc_IMPL_AVX the parameter is left unnamed and the function
// returns *this unpermuted -- the non-AVX implementation is still a TODO.
template <>
Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
#ifdef Vc_IMPL_AVX
                                             perm
#endif
                                         ) const
{
    // The commented-out code below refers to 256-bit intrinsics and is not part of the
    // SSE implementation.
    /*
    const int_m cross128 = concat(_mm_cmpgt_epi32(lo128(perm.data()), _mm_set1_epi32(3)),
                                  _mm_cmplt_epi32(hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
    SSE::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#ifdef Vc_IMPL_AVX
    return _mm_permutevar_ps(d.v(), perm.data());
#else
    return *this;//TODO
#endif
}
// broadcast from constexpr index {{{1
// broadcast<Index>(): returns a vector in which every entry is a copy of entry Index
// of this vector. Index is reduced to the valid range by masking its low bits.
template <>
template <int Index>
Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
{
    // Four entries: keep the two low bits of Index.
    constexpr VecPos Source = static_cast<VecPos>(Index & 0x3);
    return Mem::permute<Source, Source, Source, Source>(d.v());
}
template <>
template <int Index>
Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
{
    // Two entries: keep the lowest bit of Index.
    constexpr VecPos Source = static_cast<VecPos>(Index & 0x1);
    return Mem::permute<Source, Source>(d.v());
}
// }}}1

namespace Common
{
// transpose_impl {{{1
// Transposes the 4x4 float matrix formed by the four input vectors in proxy.in and
// stores the result through r[0] .. r[3]: *r[i] receives entry i of each of the four
// inputs, in input order.
//
// All inputs are copied into locals before the first store through r.
Vc_ALWAYS_INLINE void transpose_impl(
    TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
    const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
{
    const auto row0 = std::get<0>(proxy.in).data();
    const auto row1 = std::get<1>(proxy.in).data();
    const auto row2 = std::get<2>(proxy.in).data();
    const auto row3 = std::get<3>(proxy.in).data();

    // First interleave step: pair rows 0/2 and rows 1/3.
    const auto lo02 = _mm_unpacklo_ps(row0, row2);  // a0 c0 a1 c1
    const auto lo13 = _mm_unpacklo_ps(row1, row3);  // b0 d0 b1 d1
    const auto hi02 = _mm_unpackhi_ps(row0, row2);  // a2 c2 a3 c3
    const auto hi13 = _mm_unpackhi_ps(row1, row3);  // b2 d2 b3 d3

    // Second interleave step yields the transposed rows.
    *r[0] = _mm_unpacklo_ps(lo02, lo13);  // a0 b0 c0 d0
    *r[1] = _mm_unpackhi_ps(lo02, lo13);  // a1 b1 c1 d1
    *r[2] = _mm_unpacklo_ps(hi02, hi13);  // a2 b2 c2 d2
    *r[3] = _mm_unpackhi_ps(hi02, hi13);  // a3 b3 c3 d3
}
// }}}1
}  // namespace Common
}

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_SIMD_CAST_H_
#define VC_SSE_SIMD_CAST_H_

#ifdef Vc_IMPL_AVX
#endif

#ifndef VC_SSE_VECTOR_H_
#error "Vc/sse/vector.h needs to be included before Vc/sse/simd_cast.h"
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{

// Declarations: helper macros Vc_SIMD_CAST_[1248] {{{1
// Vc_SIMD_CAST_N(from_, to_) expands to the head of a function template
// `To simd_cast(...)` that takes N arguments of type from_ (named x for N == 1,
// otherwise x0 .. x<N-1>) followed by an enable_if parameter that restricts the
// overload to To == to_. In this set of definitions the enable_if parameter is
// defaulted to nullarg.
#define Vc_SIMD_CAST_1(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)

#define Vc_SIMD_CAST_2(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)

#define Vc_SIMD_CAST_4(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3,                                          \
        enable_if<std::is_same<To, to_>::value> = nullarg)

#define Vc_SIMD_CAST_8(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7,  \
        enable_if<std::is_same<To, to_>::value> = nullarg)

// Declarations: Vector casts without offset {{{1
// 1 SSE::Vector to 1 SSE::Vector {{{2
// Forward declarations of simd_cast<To>(From) for every pair of distinct SSE vector
// types, grouped by target type (first macro argument: source, second: target).
// target: int_v
Vc_SIMD_CAST_1( float_v,    int_v);
Vc_SIMD_CAST_1(double_v,    int_v);
Vc_SIMD_CAST_1(  uint_v,    int_v);
Vc_SIMD_CAST_1( short_v,    int_v);
Vc_SIMD_CAST_1(ushort_v,    int_v);
// target: uint_v
Vc_SIMD_CAST_1( float_v,   uint_v);
Vc_SIMD_CAST_1(double_v,   uint_v);
Vc_SIMD_CAST_1(   int_v,   uint_v);
Vc_SIMD_CAST_1( short_v,   uint_v);
Vc_SIMD_CAST_1(ushort_v,   uint_v);
// target: float_v
Vc_SIMD_CAST_1(double_v,  float_v);
Vc_SIMD_CAST_1(   int_v,  float_v);
Vc_SIMD_CAST_1(  uint_v,  float_v);
Vc_SIMD_CAST_1( short_v,  float_v);
Vc_SIMD_CAST_1(ushort_v,  float_v);
// target: double_v
Vc_SIMD_CAST_1( float_v, double_v);
Vc_SIMD_CAST_1(   int_v, double_v);
Vc_SIMD_CAST_1(  uint_v, double_v);
Vc_SIMD_CAST_1( short_v, double_v);
Vc_SIMD_CAST_1(ushort_v, double_v);
// target: short_v
Vc_SIMD_CAST_1(   int_v,  short_v);
Vc_SIMD_CAST_1(  uint_v,  short_v);
Vc_SIMD_CAST_1( float_v,  short_v);
Vc_SIMD_CAST_1(double_v,  short_v);
Vc_SIMD_CAST_1(ushort_v,  short_v);
// target: ushort_v
Vc_SIMD_CAST_1(   int_v, ushort_v);
Vc_SIMD_CAST_1(  uint_v, ushort_v);
Vc_SIMD_CAST_1( float_v, ushort_v);
Vc_SIMD_CAST_1(double_v, ushort_v);
Vc_SIMD_CAST_1( short_v, ushort_v);

// 2 SSE::Vector to 1 SSE::Vector {{{2
// Forward declarations of the simd_cast overloads that take two source vectors of
// the same type and return a single vector of the target type.
Vc_SIMD_CAST_2(double_v,    int_v);
Vc_SIMD_CAST_2(double_v,   uint_v);
Vc_SIMD_CAST_2(double_v,  float_v);
Vc_SIMD_CAST_2(   int_v,  short_v);
Vc_SIMD_CAST_2(  uint_v,  short_v);
Vc_SIMD_CAST_2( float_v,  short_v);
Vc_SIMD_CAST_2(double_v,  short_v);
Vc_SIMD_CAST_2(   int_v, ushort_v);
Vc_SIMD_CAST_2(  uint_v, ushort_v);
Vc_SIMD_CAST_2( float_v, ushort_v);
Vc_SIMD_CAST_2(double_v, ushort_v);

// 3 SSE::Vector to 1 SSE::Vector {{{2
// Vc_CAST_(To_) expands to a template head plus return type: the function is only
// enabled (via enable_if in the return type) when the explicitly given Return type
// is exactly To_. It is used for the three-argument double_v casts, which do not fit
// the Vc_SIMD_CAST_[1248] macros. The macro stays defined so the definitions further
// down can reuse it.
#define Vc_CAST_(To_)                                                                    \
    template <typename Return>                                                           \
    Vc_INTRINSIC Vc_CONST enable_if<std::is_same<Return, To_>::value, Return>
Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c);
Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c);

// 4 SSE::Vector to 1 SSE::Vector {{{2
// Forward declarations: four double_v arguments to one short_v / ushort_v.
Vc_SIMD_CAST_4(double_v,  short_v);
Vc_SIMD_CAST_4(double_v, ushort_v);
//}}}2
}  // namespace SSE
using SSE::simd_cast;

// 1 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: one Scalar::Vector<T> argument to one SSE vector. There is
// one overload per SSE target type, selected through the enable_if parameter on the
// explicitly specified Return type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 2 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: two Scalar::Vector<T> arguments to one SSE vector. There is
// one overload per SSE target type, selected through the enable_if parameter on the
// explicitly specified Return type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 3 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: three Scalar::Vector<T> arguments to one SSE vector. There
// is one overload per SSE target type with at least three entries, selected through
// the enable_if parameter on the explicitly specified Return type.
// The parameters are named x0, x1, x2, matching the definitions of these overloads.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 4 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: four Scalar::Vector<T> arguments to one SSE vector. There is
// one overload per SSE target type with at least four entries, selected through the
// enable_if parameter on the explicitly specified Return type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 5 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: five Scalar::Vector<T> arguments to one SSE vector. Only
// the short_v and ushort_v targets are declared for this argument count.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 6 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: six Scalar::Vector<T> arguments to one SSE vector. Only the
// short_v and ushort_v targets are declared for this argument count.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 7 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: seven Scalar::Vector<T> arguments to one SSE vector. Only
// the short_v and ushort_v targets are declared for this argument count.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// 8 Scalar::Vector to 1 SSE::Vector {{{2
// Forward declarations: eight Scalar::Vector<T> arguments to one SSE vector. Only
// the short_v and ushort_v targets are declared for this argument count.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6, Scalar::Vector<T> x7,
          enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6, Scalar::Vector<T> x7,
          enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);

// SSE::Vector to Scalar::Vector {{{2
// Forward declaration: one SSE vector to a Scalar vector type. The overload is
// enabled for any To for which Scalar::is_vector<To>::value is true.
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To
simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> = nullarg);

// helper macros Vc_SIMD_CAST_[1248] {{{1
// Redefine the helper macros for the function definitions: same template head and
// parameter lists as before, but without the `= nullarg` default on the trailing
// enable_if parameter (the default is already given by the earlier declarations).
#undef Vc_SIMD_CAST_1
#undef Vc_SIMD_CAST_2
#undef Vc_SIMD_CAST_4
#undef Vc_SIMD_CAST_8
#define Vc_SIMD_CAST_1(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)

#define Vc_SIMD_CAST_2(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1,                               \
                                       enable_if<std::is_same<To, to_>::value>)

#define Vc_SIMD_CAST_4(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3,           \
                                       enable_if<std::is_same<To, to_>::value>)

#define Vc_SIMD_CAST_8(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5, from_ x6, from_ x7,                     \
                                       enable_if<std::is_same<To, to_>::value>)

// Vector casts without offset {{{1
namespace SSE
{
// helper functions {{{2
// Narrows the eight 32-bit entries of a (entries 0-3) and b (entries 4-7) to 16 bits
// by keeping only the low 16-bit half of every entry (plain truncation, no
// saturation) and returns them in order 0..7.
//
// The lane diagrams list which source entry each 16-bit lane comes from; X marks a
// lane holding an unwanted upper half.
Vc_INTRINSIC __m128i convert_int32_to_int16(__m128i a, __m128i b)
{
    const __m128i mixedLo = _mm_unpacklo_epi16(a, b);  // 0 4 X X 1 5 X X
    const __m128i mixedHi = _mm_unpackhi_epi16(a, b);  // 2 6 X X 3 7 X X
    const __m128i evens = _mm_unpacklo_epi16(mixedLo, mixedHi);  // 0 2 4 6 X X X X
    const __m128i odds = _mm_unpackhi_epi16(mixedLo, mixedHi);   // 1 3 5 7 X X X X
    return _mm_unpacklo_epi16(evens, odds);                      // 0 1 2 3 4 5 6 7
}

// 1 SSE::Vector to 1 SSE::Vector {{{2
// to int_v {{{3
// One SSE vector to int_v: each overload hands the value of x.data() to the
// convert<From, To> helper for the matching entry types.
Vc_SIMD_CAST_1(float_v, int_v)
{
    return convert<float, int>(x.data());
}
Vc_SIMD_CAST_1(double_v, int_v)
{
    return convert<double, int>(x.data());
}
Vc_SIMD_CAST_1(uint_v, int_v)
{
    return convert<uint, int>(x.data());
}
Vc_SIMD_CAST_1(short_v, int_v)
{
    return convert<short, int>(x.data());
}
Vc_SIMD_CAST_1(ushort_v, int_v)
{
    return convert<ushort, int>(x.data());
}
// to uint_v {{{3
// One SSE vector to uint_v: each overload hands the value of x.data() to the
// convert<From, To> helper for the matching entry types.
Vc_SIMD_CAST_1(float_v, uint_v)
{
    return convert<float, uint>(x.data());
}
Vc_SIMD_CAST_1(double_v, uint_v)
{
    return convert<double, uint>(x.data());
}
Vc_SIMD_CAST_1(int_v, uint_v)
{
    return convert<int, uint>(x.data());
}
Vc_SIMD_CAST_1(short_v, uint_v)
{
    return convert<short, uint>(x.data());
}
Vc_SIMD_CAST_1(ushort_v, uint_v)
{
    return convert<ushort, uint>(x.data());
}
// to float_v {{{3
// One SSE vector to float_v: each overload hands the value of x.data() to the
// convert<From, To> helper for the matching entry types.
Vc_SIMD_CAST_1(double_v, float_v)
{
    return convert<double, float>(x.data());
}
Vc_SIMD_CAST_1(int_v, float_v)
{
    return convert<int, float>(x.data());
}
Vc_SIMD_CAST_1(uint_v, float_v)
{
    return convert<uint, float>(x.data());
}
Vc_SIMD_CAST_1(short_v, float_v)
{
    return convert<short, float>(x.data());
}
Vc_SIMD_CAST_1(ushort_v, float_v)
{
    return convert<ushort, float>(x.data());
}
// to double_v {{{3
// One SSE vector to double_v: each overload hands the value of x.data() to the
// convert<From, To> helper for the matching entry types.
Vc_SIMD_CAST_1(float_v, double_v)
{
    return convert<float, double>(x.data());
}
Vc_SIMD_CAST_1(int_v, double_v)
{
    return convert<int, double>(x.data());
}
Vc_SIMD_CAST_1(uint_v, double_v)
{
    return convert<uint, double>(x.data());
}
Vc_SIMD_CAST_1(short_v, double_v)
{
    return convert<short, double>(x.data());
}
Vc_SIMD_CAST_1(ushort_v, double_v)
{
    return convert<ushort, double>(x.data());
}
// to short_v {{{3
/*
 * §4.7 p3 (integral conversions)
 *  If the destination type is signed, the value is unchanged if it can be represented in the
 *  destination type (and bit-field width); otherwise, the value is implementation-defined.
 *
 * See also below for the Vc_SIMD_CAST_2
 *
 * the alternative, which is probably incorrect for all compilers out there:
    Vc_SIMD_CAST_1(   int_v,  short_v) { return _mm_packs_epi32(x.data(), _mm_setzero_si128()); }
    Vc_SIMD_CAST_1(  uint_v,  short_v) { return _mm_packs_epi32(x.data(), _mm_setzero_si128()); }
    Vc_SIMD_CAST_2(   int_v,  short_v) { return _mm_packs_epi32(x0.data(), x1.data()); }
    Vc_SIMD_CAST_2(  uint_v,  short_v) { return _mm_packs_epi32(x0.data(), x1.data()); }
 */
// One SSE vector to short_v. The upper entries of the result that have no source
// entry come from an all-zero register.
//
// int_v / uint_v: truncating narrowing via convert_int32_to_int16.
Vc_SIMD_CAST_1(int_v, short_v)
{
    const __m128i zero = _mm_setzero_si128();
    return SSE::convert_int32_to_int16(x.data(), zero);
}
Vc_SIMD_CAST_1(uint_v, short_v)
{
    const __m128i zero = _mm_setzero_si128();
    return SSE::convert_int32_to_int16(x.data(), zero);
}
// float_v / double_v: convert to int_v first, then narrow with _mm_packs_epi32.
Vc_SIMD_CAST_1(float_v, short_v)
{
    const __m128i asInt32 = simd_cast<SSE::int_v>(x).data();
    return _mm_packs_epi32(asInt32, _mm_setzero_si128());
}
Vc_SIMD_CAST_1(double_v, short_v)
{
    const __m128i asInt32 = simd_cast<SSE::int_v>(x).data();
    return _mm_packs_epi32(asInt32, _mm_setzero_si128());
}
// ushort_v: the register is passed through unchanged.
Vc_SIMD_CAST_1(ushort_v, short_v)
{
    return x.data();
}
// to ushort_v {{{3
// One SSE vector to ushort_v. The upper entries of the result that have no source
// entry come from an all-zero register.
//
// int_v / uint_v: truncating narrowing via convert_int32_to_int16.
Vc_SIMD_CAST_1(int_v, ushort_v)
{
    const __m128i zero = _mm_setzero_si128();
    return SSE::convert_int32_to_int16(x.data(), zero);
}
Vc_SIMD_CAST_1(uint_v, ushort_v)
{
    const __m128i zero = _mm_setzero_si128();
    return SSE::convert_int32_to_int16(x.data(), zero);
}
// float_v / double_v: convert to int_v first and reuse the int_v -> ushort_v cast.
Vc_SIMD_CAST_1(float_v, ushort_v)
{
    return simd_cast<SSE::ushort_v>(
        simd_cast<SSE::int_v>(x));
}
Vc_SIMD_CAST_1(double_v, ushort_v)
{
    return simd_cast<SSE::ushort_v>(
        simd_cast<SSE::int_v>(x));
}
// short_v: the register is passed through unchanged.
Vc_SIMD_CAST_1(short_v, ushort_v)
{
    return x.data();
}
// 2 SSE::Vector to 1 SSE::Vector {{{2
// Two double_v to one int_v: entries 0-1 of the result come from x0, entries 2-3
// from x1.
Vc_SIMD_CAST_2(double_v, int_v)
{
#ifdef Vc_IMPL_AVX
    // Join both inputs into one 256-bit register and convert it in one step.
    const auto joined = AVX::concat(x0.data(), x1.data());
    return AVX::convert<double, int>(joined);
#else
    // Convert each input separately, then combine the low 64 bits of both results.
    const auto lowerPair = convert<double, int>(x0.data());
    const auto upperPair = convert<double, int>(x1.data());
    return _mm_unpacklo_epi64(lowerPair, upperPair);
#endif
}
// Two double_v to one uint_v: entries 0-1 of the result come from x0, entries 2-3
// from x1.
Vc_SIMD_CAST_2(double_v, uint_v)
{
#ifdef Vc_IMPL_AVX
    // Join both inputs into one 256-bit register and convert it in one step.
    const auto joined = AVX::concat(x0.data(), x1.data());
    return AVX::convert<double, uint>(joined);
#else
    // Convert each input separately, then combine the low 64 bits of both results.
    const auto lowerPair = convert<double, uint>(x0.data());
    const auto upperPair = convert<double, uint>(x1.data());
    return _mm_unpacklo_epi64(lowerPair, upperPair);
#endif
}
// Two double_v to one float_v: entries 0-1 of the result come from x0, entries 2-3
// from x1.
Vc_SIMD_CAST_2(double_v, float_v)
{
#ifdef Vc_IMPL_AVX
    // Join both inputs into one 256-bit register and convert it in one step.
    const auto joined = AVX::concat(x0.data(), x1.data());
    return _mm256_cvtpd_ps(joined);
#else
    // Convert each input separately, then move the low half of the second result
    // into the high half of the first.
    const __m128 lowerPair = _mm_cvtpd_ps(x0.data());
    const __m128 upperPair = _mm_cvtpd_ps(x1.data());
    return _mm_movelh_ps(lowerPair, upperPair);
#endif
}

// Two SSE vectors to one short_v.
//
// int_v / uint_v: truncating narrowing of both inputs via convert_int32_to_int16.
Vc_SIMD_CAST_2(int_v, short_v)
{
    const __m128i first = x0.data();
    const __m128i second = x1.data();
    return SSE::convert_int32_to_int16(first, second);
}
Vc_SIMD_CAST_2(uint_v, short_v)
{
    const __m128i first = x0.data();
    const __m128i second = x1.data();
    return SSE::convert_int32_to_int16(first, second);
}
// float_v: convert each input to int_v, then narrow both with _mm_packs_epi32.
Vc_SIMD_CAST_2(float_v, short_v)
{
    const __m128i first = simd_cast<SSE::int_v>(x0).data();
    const __m128i second = simd_cast<SSE::int_v>(x1).data();
    return _mm_packs_epi32(first, second);
}
// double_v: both inputs fill a single int_v; the upper four result entries come
// from an all-zero register.
Vc_SIMD_CAST_2(double_v, short_v)
{
    const __m128i asInt32 = simd_cast<SSE::int_v>(x0, x1).data();
    return _mm_packs_epi32(asInt32, _mm_setzero_si128());
}

// Two SSE vectors to one ushort_v.
//
// int_v / uint_v: truncating narrowing of both inputs via convert_int32_to_int16.
Vc_SIMD_CAST_2(int_v, ushort_v)
{
    const __m128i first = x0.data();
    const __m128i second = x1.data();
    return SSE::convert_int32_to_int16(first, second);
}
Vc_SIMD_CAST_2(uint_v, ushort_v)
{
    const __m128i first = x0.data();
    const __m128i second = x1.data();
    return SSE::convert_int32_to_int16(first, second);
}
// float_v: convert each input to int_v and reuse the (int_v, int_v) -> ushort_v cast.
Vc_SIMD_CAST_2(float_v, ushort_v)
{
    return simd_cast<SSE::ushort_v>(
        simd_cast<SSE::int_v>(x0),
        simd_cast<SSE::int_v>(x1));
}
// double_v: merge both inputs into one int_v and reuse the int_v -> ushort_v cast.
Vc_SIMD_CAST_2(double_v, ushort_v)
{
    return simd_cast<SSE::ushort_v>(
        simd_cast<SSE::int_v>(x0, x1));
}

// 3 SSE::Vector to 1 SSE::Vector {{{2
// Three double_v to one short_v: a and b are merged into a first int_v, c is
// converted to a second int_v, and the two int_v are then cast to short_v.
Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c)
{
    return simd_cast<short_v>(
        simd_cast<int_v>(a, b),
        simd_cast<int_v>(c));
}
// Three double_v to one ushort_v: same scheme as for short_v.
Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c)
{
    return simd_cast<ushort_v>(
        simd_cast<int_v>(a, b),
        simd_cast<int_v>(c));
}
#undef Vc_CAST_

// 4 SSE::Vector to 1 SSE::Vector {{{2
// Four double_v to one short_v: each pair of inputs is merged into an int_v, and the
// two int_v registers are narrowed with _mm_packs_epi32.
Vc_SIMD_CAST_4(double_v, short_v)
{
    const __m128i lowerHalf = simd_cast<SSE::int_v>(x0, x1).data();
    const __m128i upperHalf = simd_cast<SSE::int_v>(x2, x3).data();
    return _mm_packs_epi32(lowerHalf, upperHalf);
}
// Four double_v to one ushort_v: each pair of inputs is merged into an int_v, then
// the (int_v, int_v) -> ushort_v cast is reused.
Vc_SIMD_CAST_4(double_v, ushort_v)
{
    return simd_cast<SSE::ushort_v>(
        simd_cast<SSE::int_v>(x0, x1),
        simd_cast<SSE::int_v>(x2, x3));
}
}  // namespace SSE

// 1 Scalar::Vector to 1 SSE::Vector {{{2
// Converts one scalar vector to an SSE::double_v: entry 0 holds the value of x
// (converted to double), entry 1 is zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
    simd_cast(Scalar::Vector<T> x,
              enable_if<std::is_same<Return, SSE::double_v>::value> )
{
    // _mm_set_sd stores its argument in the low element and zeroes the upper one,
    // i.e. it is the dedicated intrinsic for "one scalar, rest zero".
    return _mm_set_sd(x.data());
}
// Converts one scalar vector to an SSE::float_v: entry 0 holds the value of x
// (converted to float), entries 1-3 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
    simd_cast(Scalar::Vector<T> x,
              enable_if<std::is_same<Return, SSE::float_v>::value> )
{
    // _mm_set_ss stores its argument in the low element and zeroes the upper three,
    // i.e. it is the dedicated intrinsic for "one scalar, rest zero".
    return _mm_set_ss(x.data());
}
// Converts one scalar vector to an SSE::int_v: entry 0 holds the value of x
// (converted to int), entries 1-3 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
    simd_cast(Scalar::Vector<T> x,
              enable_if<std::is_same<Return, SSE::int_v>::value> )
{
    // _mm_cvtsi32_si128 copies its argument to the low 32 bits and zeroes the rest,
    // i.e. it is the dedicated intrinsic for "one scalar, rest zero".
    return _mm_cvtsi32_si128(x.data());
}
// Converts one scalar vector to an SSE::uint_v: entry 0 holds the value of x
// (converted to uint), entries 1-3 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
    simd_cast(Scalar::Vector<T> x,
              enable_if<std::is_same<Return, SSE::uint_v>::value> )
{
    // The value is converted to uint first, as before. _mm_cvtsi32_si128 then copies
    // it to the low 32 bits and zeroes the rest, i.e. it is the dedicated intrinsic
    // for "one scalar, rest zero".
    return _mm_cvtsi32_si128(uint(x.data()));
}
// Converts one scalar vector to an SSE::short_v: entry 0 holds the value of x
// (converted to short), entries 1-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x, enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    const short lane0 = x.data();
    // FIXME: use register-register mov
    return _mm_setr_epi16(lane0, 0, 0, 0, 0, 0, 0, 0);
}
// Converts one scalar vector to an SSE::ushort_v: entry 0 holds the bits of x
// converted to a 16-bit integer, entries 1-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x, enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    const short lane0 = x.data();
    // FIXME: use register-register mov
    return _mm_setr_epi16(lane0, 0, 0, 0, 0, 0, 0, 0);
}

// 2 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::double_v from two scalar vectors: entry 0 = x0, entry 1 = x1, each
// converted to double.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1,
    enable_if<std::is_same<Return, SSE::double_v>::value>)
{
    const double lane0 = x0.data();
    const double lane1 = x1.data();
    // FIXME: use register-register mov
    return _mm_setr_pd(lane0, lane1);
}
// Builds an SSE::float_v from two scalar vectors: entry 0 = x0, entry 1 = x1 (each
// converted to float), entries 2-3 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1,
    enable_if<std::is_same<Return, SSE::float_v>::value>)
{
    const float lane0 = x0.data();
    const float lane1 = x1.data();
    // FIXME: use register-register mov
    return _mm_setr_ps(lane0, lane1, 0.f, 0.f);
}
// Builds an SSE::int_v from two scalar vectors: entry 0 = x0, entry 1 = x1 (each
// converted to int), entries 2-3 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1,
    enable_if<std::is_same<Return, SSE::int_v>::value>)
{
    const int lane0 = x0.data();
    const int lane1 = x1.data();
    // FIXME: use register-register mov
    return _mm_setr_epi32(lane0, lane1, 0, 0);
}
// Builds an SSE::uint_v from two scalar vectors: entry 0 = x0, entry 1 = x1 (each
// converted to uint first), entries 2-3 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1,
    enable_if<std::is_same<Return, SSE::uint_v>::value>)
{
    // The intrinsic takes int parameters; the uint values are passed on as such.
    const int lane0 = uint(x0.data());
    const int lane1 = uint(x1.data());
    // FIXME: use register-register mov
    return _mm_setr_epi32(lane0, lane1, 0, 0);
}
// Builds an SSE::short_v from two scalar vectors: entry 0 = x0, entry 1 = x1 (each
// converted to short), entries 2-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1,
    enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    const short lane0 = x0.data();
    const short lane1 = x1.data();
    // FIXME: use register-register mov
    return _mm_setr_epi16(lane0, lane1, 0, 0, 0, 0, 0, 0);
}
// Builds an SSE::ushort_v from two scalar vectors: entry 0 = x0, entry 1 = x1 (each
// converted to a 16-bit integer), entries 2-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1,
    enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    const short lane0 = x0.data();
    const short lane1 = x1.data();
    // FIXME: use register-register mov
    return _mm_setr_epi16(lane0, lane1, 0, 0, 0, 0, 0, 0);
}

// 3 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::float_v from three scalar vectors: entries 0-2 = x0, x1, x2 (each
// converted to float), entry 3 is zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    enable_if<std::is_same<Return, SSE::float_v>::value>)
{
    const float lane0 = x0.data();
    const float lane1 = x1.data();
    const float lane2 = x2.data();
    return _mm_setr_ps(lane0, lane1, lane2, 0.f);
}
// Builds an SSE::int_v from three scalar vectors: entries 0-2 = x0, x1, x2 (each
// converted to int), entry 3 is zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    enable_if<std::is_same<Return, SSE::int_v>::value>)
{
    const int lane0 = x0.data();
    const int lane1 = x1.data();
    const int lane2 = x2.data();
    return _mm_setr_epi32(lane0, lane1, lane2, 0);
}
// Builds an SSE::uint_v from three scalar vectors: entries 0-2 = x0, x1, x2 (each
// converted to uint first), entry 3 is zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    enable_if<std::is_same<Return, SSE::uint_v>::value>)
{
    // The intrinsic takes int parameters; the uint values are passed on as such.
    const int lane0 = uint(x0.data());
    const int lane1 = uint(x1.data());
    const int lane2 = uint(x2.data());
    // FIXME: use register-register mov
    return _mm_setr_epi32(lane0, lane1, lane2, 0);
}
// Builds an SSE::short_v from three scalar vectors: entries 0-2 = x0, x1, x2 (each
// converted to short), entries 3-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    const short lane0 = x0.data();
    const short lane1 = x1.data();
    const short lane2 = x2.data();
    return _mm_setr_epi16(lane0, lane1, lane2, 0, 0, 0, 0, 0);
}
// Builds an SSE::ushort_v from three scalar vectors: entries 0-2 = x0, x1, x2 (each
// converted to a 16-bit integer), entries 3-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    const short lane0 = x0.data();
    const short lane1 = x1.data();
    const short lane2 = x2.data();
    return _mm_setr_epi16(lane0, lane1, lane2, 0, 0, 0, 0, 0);
}

// 4 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::float_v from four scalar vectors: entries 0-3 = x0, x1, x2, x3,
// each converted to float.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
    Scalar::Vector<T> x3, enable_if<std::is_same<Return, SSE::float_v>::value>)
{
    const float lane0 = x0.data();
    const float lane1 = x1.data();
    const float lane2 = x2.data();
    const float lane3 = x3.data();
    // FIXME: use register-register mov
    return _mm_setr_ps(lane0, lane1, lane2, lane3);
}
// Builds an SSE::int_v from four scalar vectors: entries 0-3 = x0, x1, x2, x3, each
// converted to int.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
    Scalar::Vector<T> x3, enable_if<std::is_same<Return, SSE::int_v>::value>)
{
    const int lane0 = x0.data();
    const int lane1 = x1.data();
    const int lane2 = x2.data();
    const int lane3 = x3.data();
    // FIXME: use register-register mov
    return _mm_setr_epi32(lane0, lane1, lane2, lane3);
}
// Builds an SSE::uint_v from four scalar vectors: entries 0-3 = x0, x1, x2, x3, each
// converted to uint first.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
    Scalar::Vector<T> x3, enable_if<std::is_same<Return, SSE::uint_v>::value>)
{
    // The intrinsic takes int parameters; the uint values are passed on as such.
    const int lane0 = uint(x0.data());
    const int lane1 = uint(x1.data());
    const int lane2 = uint(x2.data());
    const int lane3 = uint(x3.data());
    // FIXME: use register-register mov
    return _mm_setr_epi32(lane0, lane1, lane2, lane3);
}
// Builds an SSE::short_v from four scalar vectors: entries 0-3 = x0, x1, x2, x3
// (each converted to short), entries 4-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
    Scalar::Vector<T> x3, enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    const short lane0 = x0.data();
    const short lane1 = x1.data();
    const short lane2 = x2.data();
    const short lane3 = x3.data();
    // FIXME: use register-register mov
    return _mm_setr_epi16(lane0, lane1, lane2, lane3, 0, 0, 0, 0);
}
// Builds an SSE::ushort_v from four scalar vectors: entries 0-3 = x0, x1, x2, x3
// (each converted to a 16-bit integer), entries 4-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
    Scalar::Vector<T> x3, enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    const short lane0 = x0.data();
    const short lane1 = x1.data();
    const short lane2 = x2.data();
    const short lane3 = x3.data();
    // FIXME: use register-register mov
    return _mm_setr_epi16(lane0, lane1, lane2, lane3, 0, 0, 0, 0);
}

// 5 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::short_v from five scalar vectors: entries 0-4 = x0 .. x4 (each
// converted to short), entries 5-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    Scalar::Vector<T> x3,
    Scalar::Vector<T> x4,
    enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    return _mm_setr_epi16(static_cast<short>(x0.data()),
                          static_cast<short>(x1.data()),
                          static_cast<short>(x2.data()),
                          static_cast<short>(x3.data()),
                          static_cast<short>(x4.data()),
                          0, 0, 0);
}
// Builds an SSE::ushort_v from five scalar vectors: entries 0-4 = x0 .. x4 (each
// converted to a 16-bit integer), entries 5-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    Scalar::Vector<T> x3,
    Scalar::Vector<T> x4,
    enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    return _mm_setr_epi16(static_cast<short>(x0.data()),
                          static_cast<short>(x1.data()),
                          static_cast<short>(x2.data()),
                          static_cast<short>(x3.data()),
                          static_cast<short>(x4.data()),
                          0, 0, 0);
}

// 6 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::short_v from six scalar vectors: entries 0-5 = x0 .. x5 (each
// converted to short), entries 6-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    Scalar::Vector<T> x3,
    Scalar::Vector<T> x4,
    Scalar::Vector<T> x5,
    enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    return _mm_setr_epi16(static_cast<short>(x0.data()),
                          static_cast<short>(x1.data()),
                          static_cast<short>(x2.data()),
                          static_cast<short>(x3.data()),
                          static_cast<short>(x4.data()),
                          static_cast<short>(x5.data()),
                          0, 0);
}
// Builds an SSE::ushort_v from six scalar vectors: entries 0-5 = x0 .. x5 (each
// converted to a 16-bit integer), entries 6-7 are zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    Scalar::Vector<T> x3,
    Scalar::Vector<T> x4,
    Scalar::Vector<T> x5,
    enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    return _mm_setr_epi16(static_cast<short>(x0.data()),
                          static_cast<short>(x1.data()),
                          static_cast<short>(x2.data()),
                          static_cast<short>(x3.data()),
                          static_cast<short>(x4.data()),
                          static_cast<short>(x5.data()),
                          0, 0);
}

// 7 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::short_v from seven scalar vectors: entries 0-6 = x0 .. x6 (each
// converted to short), entry 7 is zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    Scalar::Vector<T> x3,
    Scalar::Vector<T> x4,
    Scalar::Vector<T> x5,
    Scalar::Vector<T> x6,
    enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    return _mm_setr_epi16(static_cast<short>(x0.data()),
                          static_cast<short>(x1.data()),
                          static_cast<short>(x2.data()),
                          static_cast<short>(x3.data()),
                          static_cast<short>(x4.data()),
                          static_cast<short>(x5.data()),
                          static_cast<short>(x6.data()),
                          0);
}
// Builds an SSE::ushort_v from seven scalar vectors: entries 0-6 = x0 .. x6 (each
// converted to a 16-bit integer), entry 7 is zero.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0,
    Scalar::Vector<T> x1,
    Scalar::Vector<T> x2,
    Scalar::Vector<T> x3,
    Scalar::Vector<T> x4,
    Scalar::Vector<T> x5,
    Scalar::Vector<T> x6,
    enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    return _mm_setr_epi16(static_cast<short>(x0.data()),
                          static_cast<short>(x1.data()),
                          static_cast<short>(x2.data()),
                          static_cast<short>(x3.data()),
                          static_cast<short>(x4.data()),
                          static_cast<short>(x5.data()),
                          static_cast<short>(x6.data()),
                          0);
}

// 8 Scalar::Vector to 1 SSE::Vector {{{2
// Builds an SSE::short_v from eight scalar vectors: entries 0-7 = x0 .. x7, each
// converted to short.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
    Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
    Scalar::Vector<T> x6, Scalar::Vector<T> x7,
    enable_if<std::is_same<Return, SSE::short_v>::value>)
{
    // FIXME: use register-register mov
    return _mm_setr_epi16(
        static_cast<short>(x0.data()), static_cast<short>(x1.data()),
        static_cast<short>(x2.data()), static_cast<short>(x3.data()),
        static_cast<short>(x4.data()), static_cast<short>(x5.data()),
        static_cast<short>(x6.data()), static_cast<short>(x7.data()));
}
// Builds a fully populated SSE::ushort_v out of eight scalar vectors: x0..x7 become the
// 16-bit lanes 0..7 in that order.
// The unnamed enable_if parameter restricts this overload to Return == SSE::ushort_v.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2, Scalar::Vector<T> x3,
    Scalar::Vector<T> x4, Scalar::Vector<T> x5, Scalar::Vector<T> x6, Scalar::Vector<T> x7,
    enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
    // FIXME: use register-register mov
    const __m128i lanes = _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(),
                                         x4.data(), x5.data(), x6.data(), x7.data());
    return lanes;
}

// SSE::Vector to Scalar::Vector {{{2
// Converts an SSE vector to a Scalar vector by keeping entry 0 only; every other entry
// of x is discarded. Enabled when To is a Scalar vector type.
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To simd_cast(SSE::Vector<FromT> x,
                                   enable_if<Scalar::is_vector<To>::value>)
{
    const To firstEntry = static_cast<To>(x[0]);
    return firstEntry;
}

// Mask casts without offset {{{1
// 1 SSE Mask to 1 SSE Mask {{{2
// Converts one SSE mask into another SSE mask type. The bit-level work is delegated to
// Detail::mask_cast, parameterized on the source and destination entry counts.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(SSE::Mask<T> x,
                                       enable_if<SSE::is_mask<Return>::value> = nullarg)
{
    const Return converted = {
        Detail::mask_cast<SSE::Mask<T>::Size, Return::Size, __m128>(x.dataI())};
    return converted;
}
// 2 SSE Masks to 1 SSE Mask {{{2
// Merges two SSE masks into one mask with twice as many entries.
// _mm_packs_epi16 narrows with signed saturation: x0 ends up in the lower 64 bits of
// the result and x1 in the upper 64 bits.
// NOTE(review): relies on mask entries being all-ones/all-zeros so that saturation keeps
// them intact - confirm against the Mask representation.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    SSE::Mask<T> x0,
    SSE::Mask<T> x1,
    enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 2 == Return::Size> = nullarg)
{
    const __m128i packed = _mm_packs_epi16(x0.dataI(), x1.dataI());
    return SSE::sse_cast<__m128>(packed);
}
// Merges two SSE masks into a mask with four times as many entries per input.
// Two inputs can only fill the lower half of Return: the first pack halves the entry
// width, the second pack (against zero) halves it again and leaves the upper 64 bits of
// the result cleared.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    SSE::Mask<T> x0,
    SSE::Mask<T> x1,
    enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
{
    const __m128i halfWidth = _mm_packs_epi16(x0.dataI(), x1.dataI());
    const __m128i quarterWidth = _mm_packs_epi16(halfWidth, _mm_setzero_si128());
    return SSE::sse_cast<__m128>(quarterWidth);
}
// 4 SSE Masks to 1 SSE Mask {{{2
// Merges four SSE masks into one mask with four times as many entries.
// x0/x1 are packed into the lower half and x2/x3 into the upper half of the result via
// two levels of _mm_packs_epi16.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    SSE::Mask<T> x0,
    SSE::Mask<T> x1,
    SSE::Mask<T> x2,
    SSE::Mask<T> x3,
    enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
{
    const __m128i lowerPair = _mm_packs_epi16(x0.dataI(), x1.dataI());
    const __m128i upperPair = _mm_packs_epi16(x2.dataI(), x3.dataI());
    return SSE::sse_cast<__m128>(_mm_packs_epi16(lowerPair, upperPair));
}

// 1 Scalar Mask to 1 SSE Mask {{{2
// Widens one Scalar mask to an SSE mask: entry 0 takes the value of x, all remaining
// entries are false.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x,
                                       enable_if<SSE::is_mask<Return>::value> = nullarg)
{
    Return result(false);  // start with every entry cleared
    result[0] = x[0];
    return result;
}
// 2 Scalar Masks to 1 SSE Mask {{{2
// Combines two Scalar masks into an SSE mask: entries 0 and 1 come from x0 and x1,
// all remaining entries are false.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
                                       Scalar::Mask<T> x1,
                                       enable_if<SSE::is_mask<Return>::value> = nullarg)
{
    Return result(false);  // start with every entry cleared
    result[0] = x0[0];
    result[1] = x1[0];
    return result;
}
// 4 Scalar Masks to 1 SSE Mask {{{2
// Combines four Scalar masks into an SSE mask. Entries 0 and 1 are always written;
// entries 2 and 3 are written only when Return has at least four entries, so x2/x3 are
// ignored for narrower masks. Entries beyond the written ones stay false.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
                                       Scalar::Mask<T> x1,
                                       Scalar::Mask<T> x2,
                                       Scalar::Mask<T> x3,
                                       enable_if<SSE::is_mask<Return>::value> = nullarg)
{
    Return result(false);
    result[0] = x0[0];
    result[1] = x1[0];
    if (Return::Size < 4) {
        return result;  // no room for x2/x3
    }
    result[2] = x2[0];
    result[3] = x3[0];
    return result;
}
// 8 Scalar Masks to 1 SSE Mask {{{2
// Combines eight Scalar masks into an SSE mask. Only as many inputs as Return has
// entries are used: x0/x1 always, x2/x3 when Return::Size >= 4, x4..x7 when
// Return::Size >= 8. Unused inputs are ignored.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
                                       Scalar::Mask<T> x1,
                                       Scalar::Mask<T> x2,
                                       Scalar::Mask<T> x3,
                                       Scalar::Mask<T> x4,
                                       Scalar::Mask<T> x5,
                                       Scalar::Mask<T> x6,
                                       Scalar::Mask<T> x7,
                                       enable_if<SSE::is_mask<Return>::value> = nullarg)
{
    Return result(false);
    result[0] = x0[0];
    result[1] = x1[0];
    if (Return::Size < 4) {
        return result;  // no room for x2..x7
    }
    result[2] = x2[0];
    result[3] = x3[0];
    if (Return::Size < 8) {
        return result;  // no room for x4..x7
    }
    result[4] = x4[0];
    result[5] = x5[0];
    result[6] = x6[0];
    result[7] = x7[0];
    return result;
}

// 1 SSE::Mask to 1 Scalar::Mask {{{2
// Converts an SSE mask to a Scalar mask by keeping entry 0 only; every other entry of x
// is discarded.
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To simd_cast(SSE::Mask<FromT> x,
                                   enable_if<Scalar::is_mask<To>::value> = nullarg)
{
    const To firstEntry = static_cast<To>(x[0]);
    return firstEntry;
}
// offset == 0 | convert from SSE::Mask/Vector to SSE::Mask/Vector {{{1
// Offset form with offset == 0 for SSE -> SSE conversions (vector to vector, or mask to
// mask): an offset of zero selects the first slice, so this simply forwards to the
// offset-free simd_cast<Return>.
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return
    simd_cast(V &&x, enable_if<offset == 0 && ((SSE::is_vector<Traits::decay<V>>::value &&
                                                SSE::is_vector<Return>::value) ||
                                               (SSE::is_mask<Traits::decay<V>>::value &&
                                                SSE::is_mask<Return>::value))> = nullarg)
{
    const Return converted = simd_cast<Return>(x);
    return converted;
}

// Offset form with offset == 0 for Scalar -> SSE conversions (vector to vector, or mask
// to mask): forwards to the offset-free simd_cast<Return>.
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return
    simd_cast(V &&x,
              enable_if<offset == 0 && ((Scalar::is_vector<Traits::decay<V>>::value &&
                                         SSE::is_vector<Return>::value) ||
                                        (Scalar::is_mask<Traits::decay<V>>::value &&
                                         SSE::is_mask<Return>::value))> = nullarg)
{
    const Return converted = simd_cast<Return>(x);
    return converted;
}

// Vector casts with offset {{{1
// SSE to SSE (Vector) {{{2
// SSE vector -> SSE vector with a non-zero offset.
// The input register is shifted right by whole bytes so that the entries belonging to
// slice `offset` (each slice being Return::Size entries of V) start at entry 0; the
// shifted vector is then handed to the offset-free simd_cast<Return>.
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    V x,
    enable_if<offset != 0 && (SSE::is_vector<Return>::value && SSE::is_vector<V>::value)> = nullarg)
{
    // bytes per entry of V * number of entries to skip
    constexpr int byteShift = (sizeof(V) / V::Size) * offset * Return::Size;
    static_assert(byteShift > 0 && byteShift < 16, "");
    const __m128i rawBits = SSE::sse_cast<__m128i>(x.data());
    const __m128i shifted = _mm_srli_si128(rawBits, byteShift & 0xff);
    return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(shifted)});
}

// SSE to Scalar (Vector) {{{2
// SSE vector -> Scalar vector with a non-zero offset: picks entry `offset` of x and
// converts it to the entry type of Return.
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    SSE::Vector<T> x,
    enable_if<offset != 0 && Scalar::is_vector<Return>::value> = nullarg)
{
    using Entry = typename Return::EntryType;
    return static_cast<Entry>(x[offset]);
}

// Mask casts with offset {{{1
// SSE to SSE (Mask)
// SSE mask -> SSE mask with a non-zero offset.
// The mask register is shifted right by whole bytes so that the entries belonging to
// slice `offset` (each slice being Return::Size entries of V) start at entry 0; the
// shifted mask is then handed to the offset-free simd_cast<Return>.
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return simd_cast(
    V x,
    enable_if<offset != 0 && (SSE::is_mask<Return>::value && SSE::is_mask<V>::value)> = nullarg)
{
    // bytes per entry of V * number of entries to skip
    constexpr int byteShift = (sizeof(V) / V::Size) * offset * Return::Size;
    static_assert(byteShift > 0 && byteShift < 16, "");
    const __m128i rawBits = SSE::sse_cast<__m128i>(x.data());
    const __m128i shifted = _mm_srli_si128(rawBits, byteShift & 0xff);
    return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(shifted)});
}

// undef Vc_SIMD_CAST_[1248] {{{1
#undef Vc_SIMD_CAST_1
#undef Vc_SIMD_CAST_2
#undef Vc_SIMD_CAST_4
#undef Vc_SIMD_CAST_8
// }}}1

}  // namespace Vc

#endif // VC_SSE_SIMD_CAST_H_

// vim: foldmethod=marker

#endif // VC_SSE_VECTOR_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_VECTORHELPER_H_
#define VC_AVX_VECTORHELPER_H_

#include <limits>

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
        // Store helpers for a __m256 (packed float) register.
        // Every overload is a template on Flags; the trailing defaulted parameter uses a
        // type published by Flags and presumably makes only the matching overload viable
        // (the Flags types are defined elsewhere).
        template<> struct VectorHelper<__m256>
        {
            using VectorType = __m256;
            using VTArg = const VectorType;

            // ---- unmasked stores of the whole register ----

            // aligned store
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VTArg x,
                                               typename Flags::EnableIfAligned = nullptr)
            {
                _mm256_store_ps(mem, x);
            }

            // unaligned, non-streaming store
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VTArg x,
                                               typename Flags::EnableIfUnalignedNotStreaming = nullptr)
            {
                _mm256_storeu_ps(mem, x);
            }

            // aligned streaming store
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VTArg x,
                                               typename Flags::EnableIfStreaming = nullptr)
            {
                _mm256_stream_ps(mem, x);
            }

            // unaligned streaming store: goes through the masked stream_store helper with
            // the mask produced by setallone_ps()
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(float *mem, VTArg x,
                                               typename Flags::EnableIfUnalignedAndStreaming = nullptr)
            {
                AvxIntrinsics::stream_store(mem, x, setallone_ps());
            }

            // ---- masked stores: m is passed through as the write mask ----

            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(
                float *mem, VTArg x, VTArg m,
                typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr)
            {
                _mm256_maskstore(mem, m, x);
            }

            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(
                float *mem, VTArg x, VTArg m,
                typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr)
            {
                AvxIntrinsics::stream_store(mem, x, m);
            }
        };

        // Store helpers for a __m256d (packed double) register.
        // Every overload is a template on Flags; the trailing defaulted parameter uses a
        // type published by Flags and presumably makes only the matching overload viable
        // (the Flags types are defined elsewhere).
        template<> struct VectorHelper<__m256d>
        {
            using VectorType = __m256d;
            using VTArg = const VectorType;

            // ---- unmasked stores of the whole register ----

            // aligned store
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VTArg x,
                                               typename Flags::EnableIfAligned = nullptr)
            {
                _mm256_store_pd(mem, x);
            }

            // unaligned, non-streaming store
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VTArg x,
                                               typename Flags::EnableIfUnalignedNotStreaming = nullptr)
            {
                _mm256_storeu_pd(mem, x);
            }

            // aligned streaming store
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VTArg x,
                                               typename Flags::EnableIfStreaming = nullptr)
            {
                _mm256_stream_pd(mem, x);
            }

            // unaligned streaming store: goes through the masked stream_store helper with
            // the mask produced by setallone_pd()
            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(double *mem, VTArg x,
                                               typename Flags::EnableIfUnalignedAndStreaming = nullptr)
            {
                AvxIntrinsics::stream_store(mem, x, setallone_pd());
            }

            // ---- masked stores: m is passed through as the write mask ----

            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(
                double *mem, VTArg x, VTArg m,
                typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr)
            {
                _mm256_maskstore(mem, m, x);
            }

            template <typename Flags>
            static Vc_ALWAYS_INLINE void store(
                double *mem, VTArg x, VTArg m,
                typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr)
            {
                AvxIntrinsics::stream_store(mem, x, m);
            }
        };

        // Store helpers for a __m256i (packed integer) register. The destination pointer
        // type T is deduced; the plain stores reinterpret it as __m256i *.
        // Every overload is a template on Flags; the trailing defaulted parameter uses a
        // type published by Flags and presumably makes only the matching overload viable
        // (the Flags types are defined elsewhere).
        template<> struct VectorHelper<__m256i>
        {
            using VectorType = __m256i;
            using VTArg = const VectorType;

            // ---- unmasked stores of the whole register ----

            // aligned store
            template <typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VTArg x,
                                               typename Flags::EnableIfAligned = nullptr)
            {
                _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x);
            }

            // unaligned, non-streaming store
            template <typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VTArg x,
                                               typename Flags::EnableIfUnalignedNotStreaming = nullptr)
            {
                _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x);
            }

            // aligned streaming store
            template <typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VTArg x,
                                               typename Flags::EnableIfStreaming = nullptr)
            {
                _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x);
            }

            // unaligned streaming store: goes through the masked stream_store helper with
            // the mask produced by setallone_si256()
            template <typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(T *mem, VTArg x,
                                               typename Flags::EnableIfUnalignedAndStreaming = nullptr)
            {
                AvxIntrinsics::stream_store(mem, x, setallone_si256());
            }

            // ---- masked stores: m is passed through as the write mask ----

            template <typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(
                T *mem, VTArg x, VTArg m,
                typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr)
            {
                _mm256_maskstore(mem, m, x);
            }

            template <typename Flags, typename T>
            static Vc_ALWAYS_INLINE void store(
                T *mem, VTArg x, VTArg m,
                typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr)
            {
                AvxIntrinsics::stream_store(mem, x, m);
            }
        };

// Helper macros that stamp out static member wrappers inside the VectorHelper<T>
// specializations below. They expect VectorType, VTArg and the Vc_SUFFIX macro (e.g. pd
// or ps) to be defined at the point of use. Vc_CAT2 is defined elsewhere and presumably
// pastes its two arguments into one identifier.
//
// Vc_OP1(op): unary member `op(a)` calling _mm256_<op>_<Vc_SUFFIX>(a).
#define Vc_OP1(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
// Vc_OP(op): binary member `op(a, b)` calling <op>_<Vc_SUFFIX>(a, b) (no _mm256_ prefix).
#define Vc_OP(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
// Vc_OP_(op): binary member `op(a, b)` calling _mm256_<op><Vc_SUFFIX>(a, b); no underscore
// is inserted between op and the suffix.
#define Vc_OP_(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op    , Vc_SUFFIX)(a, b); }
// Vc_OPx(op, op2): binary member named `op` that calls _mm256_<op2>_<Vc_SUFFIX>(a, b).
#define Vc_OPx(op, op2) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }

        // Arithmetic helpers for four doubles held in a __m256d register.
        // While Vc_SUFFIX is defined as `pd`, Vc_CAT2(prefix, Vc_SUFFIX) presumably
        // expands to the `_pd` flavour of the named intrinsic/helper (Vc_CAT2 is defined
        // elsewhere).
        template<> struct VectorHelper<double> {
            typedef __m256d VectorType;
            typedef const VectorType VTArg;
            typedef double EntryType;
#define Vc_SUFFIX pd

            // Bitwise AND of `a` with `mask` (reinterpreted from __m256 to __m256d).
            static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
            // Broadcast `a` to all four entries.
            static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
            // Arguments are forwarded in _mm256_set_pd order, i.e. `a` is the highest
            // entry and `d` is entry 0.
            static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
                return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
            }
            static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
            // Delegates to the setone_pd helper (defined elsewhere).
            static Vc_ALWAYS_INLINE VectorType one()  { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.); }

            // In-place multiply-add: v1 = v1 * v2 + v3.
            // With FMA4 this is a single instruction. Otherwise both factors are split
            // into a "high" part (bits selected by c_general::highMaskDouble) and the
            // remaining "low" part, and the partial products are summed together with v3.
            static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
                v1 = _mm256_macc_pd(v1, v2, v3);
#else
                VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
                VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
                // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
                // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
                asm("":"+x"(h1), "+x"(h2));
#endif
                // low parts: what is left after removing the high parts
                const VectorType l1 = _mm256_sub_pd(v1, h1);
                const VectorType l2 = _mm256_sub_pd(v2, h2);
                const VectorType ll = mul(l1, l2);
                const VectorType lh = add(mul(l1, h2), mul(h1, l2));
                const VectorType hh = mul(h1, h2);
                // ll < lh < hh for all entries is certain
                // Per entry, b receives whichever of lh/v3 has the smaller magnitude and
                // c the other one, so the sum below pairs small terms (ll + b) and large
                // terms (c + hh).
                const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
                const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
                const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
                v1 = add(add(ll, b), add(c, hh));
#endif
            }

            // Element-wise arithmetic.
            static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
            static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
            static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }

            // Expands to sqrt(a) calling _mm256_sqrt_pd.
            Vc_OP1(sqrt)
            // Reciprocal square root computed as one() / sqrt(x) (a real division).
            static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
                return _mm256_div_pd(one(), sqrt(x));
            }
            // Reciprocal computed as one() / x (a real division).
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
                return _mm256_div_pd(one(), x);
            }
            // Bitwise AND with the mask returned by setabsmask_pd() (defined elsewhere).
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
                return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
            }

            // Element-wise minimum / maximum.
            static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
            // Horizontal reductions over the four entries of `a`: the two 128-bit halves
            // are combined first, then the two remaining entries; the result is read
            // from entry 0.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
                __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
                return _mm_cvtsd_f64(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
                __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
                return _mm_cvtsd_f64(b);
            }
            // Horizontal product of all four entries.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
                __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(b);
            }
            // Horizontal sum of all four entries.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
                __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(b);
            }
#undef Vc_SUFFIX
            // Rounds every entry using the _MM_FROUND_NINT rounding control.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
                return _mm256_round_pd(a, _MM_FROUND_NINT);
            }
        };

        // Arithmetic helpers for eight floats held in a __m256 register.
        // While Vc_SUFFIX is defined as `ps`, Vc_CAT2(prefix, Vc_SUFFIX) presumably
        // expands to the `_ps` flavour of the named intrinsic/helper (Vc_CAT2 is defined
        // elsewhere).
        template<> struct VectorHelper<float> {
            typedef float EntryType;
            typedef __m256 VectorType;
            typedef const VectorType VTArg;
#define Vc_SUFFIX ps

            // Bitwise AND of `a` with `mask`.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
            // Broadcast `a` to all eight entries.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
            // Arguments are forwarded in _mm256_set_ps order, i.e. `a` is the highest
            // entry and `h` is entry 0.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
                    const float e, const float f, const float g, const float h) {
                return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
            // Delegates to the setone_ps helper (defined elsewhere).
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.f); }
            // Converts two double vectors to float and joins them: `a` fills the lower
            // 128 bits of the result, `b` the upper 128 bits.
            static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

            // In-place multiply-add: v1 = v1 * v2 + v3.
            // With FMA4 this is a single instruction. Otherwise each 128-bit half is
            // widened to double, multiplied and added there, converted back to float and
            // the two halves are joined again with AVX::concat.
            static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
                v1 = _mm256_macc_ps(v1, v2, v3);
#else
                __m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
                __m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
                __m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
                __m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
                __m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
                __m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
                v1 = AVX::concat(
                        _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
                        _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
            }

            // Element-wise arithmetic.
            static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }

            // Expands to sqrt(a) -> _mm256_sqrt_ps and rsqrt(a) -> _mm256_rsqrt_ps.
            // Note: unlike the double helper, rsqrt and reciprocal here map to the
            // hardware estimate instructions rather than a division.
            Vc_OP1(sqrt) Vc_OP1(rsqrt)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
                return _mm256_rcp_ps(x);
            }
            // Bitwise AND with the mask returned by setabsmask_ps() (defined elsewhere).
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
                return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
            }

            // Element-wise minimum / maximum.
            static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
            // Horizontal reductions over the eight entries of `a`: the two 128-bit
            // halves are combined first, then the remaining four entries are folded down
            // to entry 0, which is returned. In the step comments below a0..a3 refer to
            // the entries of the intermediate 128-bit value b.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
                __m128 b = _mm_min_ps(lo128(a), hi128(a));
                b = _mm_min_ps(b, _mm_movehl_ps(b, b));   // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
                b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
                __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                b = _mm_max_ps(b, _mm_movehl_ps(b, b));   // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
                b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(b);
            }
            // Horizontal product of all eight entries. The first shuffle reverses the
            // four entries of b, so entry 0 becomes b0*b3 and entry 1 becomes b1*b2; the
            // second step multiplies those two into entry 0.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
                __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
                b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(b);
            }
            // Horizontal sum of all eight entries; same shuffle scheme as mul() above.
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
                __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
                b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(b);
            }
#undef Vc_SUFFIX
            // Rounds every entry using the _MM_FROUND_NINT rounding control.
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
                return _mm256_round_ps(a, _MM_FROUND_NINT);
            }
        };

#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx

}  // namespace AVX(2)
}  // namespace Vc

#endif // VC_AVX_VECTORHELPER_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_MASK_H_
#define VC_AVX_MASK_H_

#include <array>

/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_DETAIL_H_
#define VC_AVX_DETAIL_H_


namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// (converting) load functions {{{1
// Aligned load of a full __m256 from float memory (uses _mm256_load_ps).
// Participates only when Flags provides EnableIfAligned.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
                                 typename Flags::EnableIfAligned = nullptr)
{
    const __m256 loaded = _mm256_load_ps(x);
    return loaded;
}
// Unaligned load of a full __m256 from float memory (uses _mm256_loadu_ps).
// Participates only when Flags provides EnableIfUnaligned.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
                                 typename Flags::EnableIfUnaligned = nullptr)
{
    const __m256 loaded = _mm256_loadu_ps(x);
    return loaded;
}
// Streaming load of a full __m256 from float memory; delegates to
// AvxIntrinsics::stream_load. Participates only when Flags provides EnableIfStreaming.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
                                 typename Flags::EnableIfStreaming = nullptr)
{
    const __m256 loaded = AvxIntrinsics::stream_load<__m256>(x);
    return loaded;
}

// Aligned load of a full __m256d from double memory (uses _mm256_load_pd).
// Participates only when Flags provides EnableIfAligned.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
                                  typename Flags::EnableIfAligned = nullptr)
{
    const __m256d loaded = _mm256_load_pd(x);
    return loaded;
}
// Unaligned load of a full __m256d from double memory (uses _mm256_loadu_pd).
// Participates only when Flags provides EnableIfUnaligned.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
                                  typename Flags::EnableIfUnaligned = nullptr)
{
    const __m256d loaded = _mm256_loadu_pd(x);
    return loaded;
}
// Streaming load of a full __m256d from double memory; delegates to
// AvxIntrinsics::stream_load. Participates only when Flags provides EnableIfStreaming.
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
                                  typename Flags::EnableIfStreaming = nullptr)
{
    const __m256d loaded = AvxIntrinsics::stream_load<__m256d>(x);
    return loaded;
}

// Aligned load of a full __m256i from memory of any integral type T
// (uses _mm256_load_si256). Participates only when Flags provides EnableIfAligned.
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
{
    const __m256i *const source = reinterpret_cast<const __m256i *>(x);
    return _mm256_load_si256(source);
}
// Unaligned load of a full __m256i from memory of any integral type T
// (uses _mm256_loadu_si256). Participates only when Flags provides EnableIfUnaligned.
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
{
    const __m256i *const source = reinterpret_cast<const __m256i *>(x);
    return _mm256_loadu_si256(source);
}
// Streaming load of a full __m256i from memory of any integral type T; delegates to
// AvxIntrinsics::stream_load. Participates only when Flags provides EnableIfStreaming.
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
{
    const __m256i loaded = AvxIntrinsics::stream_load<__m256i>(x);
    return loaded;
}

// load32{{{2
// 32-byte aligned load of floats, selected by the when_aligned tag.
Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
{
    const __m256 loaded = _mm256_load_ps(mem);
    return loaded;
}
// 32-byte unaligned load of floats, selected by the when_unaligned tag.
Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
{
    const __m256 loaded = _mm256_loadu_ps(mem);
    return loaded;
}
// 32-byte streaming load of floats, selected by the when_streaming tag; delegates to
// AvxIntrinsics::stream_load.
Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
{
    const __m256 loaded = AvxIntrinsics::stream_load<__m256>(mem);
    return loaded;
}
// 32-byte aligned load of doubles, selected by the when_aligned tag.
Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
{
    const __m256d loaded = _mm256_load_pd(mem);
    return loaded;
}
// 32-byte unaligned load of doubles, selected by the when_unaligned tag.
Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
{
    const __m256d loaded = _mm256_loadu_pd(mem);
    return loaded;
}
// 32-byte streaming load of doubles, selected by the when_streaming tag; delegates to
// AvxIntrinsics::stream_load.
Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
{
    const __m256d loaded = AvxIntrinsics::stream_load<__m256d>(mem);
    return loaded;
}
// 32-byte aligned load from memory of an integral type T, selected by the when_aligned
// tag. Non-integral T is rejected at compile time.
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
{
    static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
    const __m256i *const source = reinterpret_cast<const __m256i *>(mem);
    return _mm256_load_si256(source);
}
// 32-byte unaligned load from memory of an integral type T, selected by the
// when_unaligned tag. Non-integral T is rejected at compile time.
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
{
    static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
    const __m256i *const source = reinterpret_cast<const __m256i *>(mem);
    return _mm256_loadu_si256(source);
}
// 32-byte streaming load from memory of an integral type T, selected by the
// when_streaming tag; delegates to AvxIntrinsics::stream_load. Non-integral T is
// rejected at compile time.
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
{
    static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
    const __m256i loaded = AvxIntrinsics::stream_load<__m256i>(mem);
    return loaded;
}

// MSVC workarounds{{{2
#ifdef Vc_MSVC
// work around: "fatal error C1001: An internal error has occurred in the compiler."
// Non-template aligned load used as part of the MSVC internal-compiler-error workaround.
// NOTE(review): the pointer is `const uint *` while the tag is LoadTag<__m256i, int>;
// looks deliberate for the workaround - confirm.
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
{
    const __m256i *const source = reinterpret_cast<const __m256i *>(mem);
    return _mm256_load_si256(source);
}

// Non-template unaligned double load (MSVC workaround section).
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
{
    const __m256d loaded = _mm256_loadu_pd(mem);
    return loaded;
}

// MSVC workaround overload: aligned float load, enabled only for V == __m256 and
// DstT == float.
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m256>::value)> = nullarg)
{
    const __m256 loaded = _mm256_load_ps(mem);
    return loaded;
}

// MSVC workaround overload: unaligned float load, enabled only for V == __m256 and
// DstT == float.
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m256>::value)> = nullarg)
{
    const __m256 loaded = _mm256_loadu_ps(mem);
    return loaded;
}

// MSVC workaround overload: streaming float load (via AvxIntrinsics::stream_load),
// enabled only for V == __m256 and DstT == float.
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m256>::value)> = nullarg)
{
    const __m256 loaded = AvxIntrinsics::stream_load<__m256>(mem);
    return loaded;
}

// MSVC workaround overload: aligned double load, enabled only for V == __m256d and
// DstT == double.
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m256d>::value)> = nullarg)
{
    const __m256d loaded = _mm256_load_pd(mem);
    return loaded;
}

// MSVC workaround overload: unaligned double load, enabled only for V == __m256d and
// DstT == double.
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m256d>::value)> = nullarg)
{
    const __m256d loaded = _mm256_loadu_pd(mem);
    return loaded;
}

// MSVC workaround overload: streaming double load (via AvxIntrinsics::stream_load),
// enabled only for V == __m256d and DstT == double.
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m256d>::value)> = nullarg)
{
    const __m256d loaded = AvxIntrinsics::stream_load<__m256d>(mem);
    return loaded;
}

// Non-converting uint loads, one overload per load flag. The defaulted
// enable_if parameter restricts each overload to V == __m256i and DstT == uint.

// when_aligned: aligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_load_si256(src);
}

// when_unaligned: unaligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_loadu_si256(src);
}

// when_streaming: forwards to AvxIntrinsics::stream_load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const __m256i streamed = AvxIntrinsics::stream_load<__m256i>(mem);
    return streamed;
}

// Non-converting int loads, one overload per load flag. The defaulted
// enable_if parameter restricts each overload to V == __m256i and DstT == int.

// when_unaligned: unaligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_loadu_si256(src);
}

// when_aligned: aligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_load_si256(src);
}

// when_streaming: forwards to AvxIntrinsics::stream_load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const __m256i streamed = AvxIntrinsics::stream_load<__m256i>(mem);
    return streamed;
}

// Non-converting short loads, one overload per load flag. The defaulted
// enable_if parameter restricts each overload to V == __m256i and DstT == short.

// when_unaligned: unaligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_loadu_si256(src);
}

// when_aligned: aligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_load_si256(src);
}

// when_streaming: forwards to AvxIntrinsics::stream_load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const __m256i streamed = AvxIntrinsics::stream_load<__m256i>(mem);
    return streamed;
}

// Non-converting ushort loads, one overload per load flag. The defaulted
// enable_if parameter restricts each overload to V == __m256i and DstT == ushort.

// when_unaligned: unaligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_loadu_si256(src);
}

// when_aligned: aligned integer load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const auto *const src = reinterpret_cast<const __m256i *>(mem);
    return _mm256_load_si256(src);
}

// when_streaming: forwards to AvxIntrinsics::stream_load.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    const __m256i streamed = AvxIntrinsics::stream_load<__m256i>(mem);
    return streamed;
}

#endif  // Vc_MSVC

// short {{{2
// ushort memory -> short vector: same element width, so a plain 32-byte load.
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
{
    const auto raw = load32(mem, f);
    return raw;
}
// uchar memory -> short vector: 16 bytes are loaded and widened 8 -> 16 bit
// with the unsigned conversion.
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
{
    const auto bytes = load16(mem, f);
    return AVX::cvtepu8_epi16(bytes);
}
// schar memory -> short vector: 16 bytes are loaded and widened 8 -> 16 bit
// with the signed conversion.
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
{
    const auto bytes = load16(mem, f);
    return AVX::cvtepi8_epi16(bytes);
}

// ushort {{{2
// uchar memory -> ushort vector: 16 bytes are loaded and widened 8 -> 16 bit
// with the unsigned conversion.
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
{
    const auto bytes = load16(mem, f);
    return AVX::cvtepu8_epi16(bytes);
}

// int {{{2
// uint memory -> int vector: same element width, so a plain 32-byte load.
template <typename Flags>
Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
{
    const auto raw = load32(mem, f);
    return raw;
}
// ushort memory -> int vector: 16 bytes (8 elements) widened 16 -> 32 bit,
// unsigned conversion.
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
{
    const auto words = load16(mem, f);
    return AVX::cvtepu16_epi32(words);
}
// short memory -> int vector: 16 bytes (8 elements) widened 16 -> 32 bit,
// signed conversion.
template <typename Flags>
Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
{
    const auto words = load16(mem, f);
    return AVX::cvtepi16_epi32(words);
}
// uchar memory -> int vector: only 8 bytes are needed, so a 64-bit load is
// used (the load flags are not consulted) before the unsigned 8 -> 32 widening.
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
{
    const __m128i bytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return AVX::cvtepu8_epi32(bytes);
}
// schar memory -> int vector: only 8 bytes are needed, so a 64-bit load is
// used (the load flags are not consulted) before the signed 8 -> 32 widening.
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
{
    const __m128i bytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return AVX::cvtepi8_epi32(bytes);
}

// uint {{{2
// ushort memory -> uint vector: 16 bytes (8 elements) widened 16 -> 32 bit,
// unsigned conversion.
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
{
    const auto words = load16(mem, f);
    return AVX::cvtepu16_epi32(words);
}
// uchar memory -> uint vector: only 8 bytes are needed, so a 64-bit load is
// used (the load flags are not consulted) before the unsigned 8 -> 32 widening.
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
{
    const __m128i bytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return AVX::cvtepu8_epi32(bytes);
}

// double {{{2
// float memory -> double vector: four floats (16 bytes) are loaded and widened.
template <typename Flags>
Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
{
    const auto quad = load16(mem, f);
    return AVX::convert<float, double>(quad);
}
// uint memory -> double vector: four 32-bit values (16 bytes) are loaded and
// converted.
template <typename Flags>
Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
{
    const auto quad = load16(mem, f);
    return AVX::convert<uint, double>(quad);
}
// int memory -> double vector: four 32-bit values (16 bytes) are loaded and
// converted.
template <typename Flags>
Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
{
    const auto quad = load16(mem, f);
    return AVX::convert<int, double>(quad);
}
// Converting loads of 8-/16-bit integers into a vector of four doubles.
//
// The narrow elements must be widened to 32-bit lanes before the int -> double
// conversion; feeding the raw bytes of a 16-byte load to convert<int, double>
// would reinterpret packed 8-/16-bit values as 32-bit integers. Only four
// source elements are consumed, so exactly that many are read. The load flags
// are therefore not consulted (none of the loads used here has an alignment
// requirement), which mirrors the uchar/schar -> int loads.

// ushort -> double: 64-bit load (4 elements), zero-extend 16 -> 32 bit, convert.
template <typename Flags>
Vc_INTRINSIC __m256d load(const ushort *mem, Flags, LoadTag<__m256d, double>)
{
    const __m128i words = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    // The upper four 32-bit lanes stem from the zeroed half of `words` and are
    // dropped by lo128.
    return AVX::convert<int, double>(AVX::lo128(AVX::cvtepu16_epi32(words)));
}
// short -> double: 64-bit load (4 elements), sign-extend 16 -> 32 bit, convert.
template <typename Flags>
Vc_INTRINSIC __m256d load(const short *mem, Flags, LoadTag<__m256d, double>)
{
    const __m128i words = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
    return AVX::convert<int, double>(AVX::lo128(AVX::cvtepi16_epi32(words)));
}
// uchar -> double: the four bytes are read individually (integral promotion
// zero-extends them) so that no memory beyond mem[3] is touched.
template <typename Flags>
Vc_INTRINSIC __m256d load(const uchar *mem, Flags, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(_mm_setr_epi32(mem[0], mem[1], mem[2], mem[3]));
}
// schar -> double: the four bytes are read individually (integral promotion
// sign-extends them) so that no memory beyond mem[3] is touched.
template <typename Flags>
Vc_INTRINSIC __m256d load(const schar *mem, Flags, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(_mm_setr_epi32(mem[0], mem[1], mem[2], mem[3]));
}

// float {{{2
// double memory -> float vector: two groups of four doubles are loaded,
// narrowed to float individually and joined into one 8-element vector.
template <typename Flags>
Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
{
    const auto firstFour = _mm256_cvtpd_ps(load32(&mem[0], f));
    const auto lastFour = _mm256_cvtpd_ps(load32(&mem[4], f));
    return AVX::concat(firstFour, lastFour);
}
// uint memory -> float vector. Only a signed int -> float conversion is used,
// so lanes that look negative when read as signed are converted after
// subtracting 2^31 and get 2^31 added back in float; all other lanes use the
// direct conversion.
template <typename Flags>
Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
{
    const auto v = load32(mem, f);
    const __m256 direct = _mm256_cvtepi32_ps(v);
    const __m256 rebased =
        _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
                      AVX::set2power31_ps());
    // mask lanes whose sign bit is set
    const __m256 signBitSet =
        _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256()));
    return _mm256_blendv_ps(direct, rebased, signBitSet);
}
// int memory -> float vector: plain 32-byte load followed by the conversion.
template <typename Flags>
Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
{
    const auto ints = load32(mem, f);
    return AVX::convert<int, float>(ints);
}
// Generic fallback for any non-float source type: first perform the converting
// load to an int vector, then convert those ints to float.
template <typename T, typename Flags,
          typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
{
    const auto asInts = load<__m256i, int>(mem, f);
    return _mm256_cvtepi32_ps(asInts);
}
// ushort memory -> float vector: 16 bytes (8 elements) loaded and converted.
template <typename Flags>
Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
{
    const auto words = load16(mem, f);
    return AVX::convert<ushort, float>(words);
}
// short memory -> float vector: 16 bytes (8 elements) loaded and converted.
template <typename Flags>
Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
{
    const auto words = load16(mem, f);
    return AVX::convert<short, float>(words);
}
/*
template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
    static __m256 load(const unsigned char *mem, Flags)
    {
        return _mm256_cvtepi32_ps(
            cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))));
    }
};
template<typename Flags> struct LoadHelper<float, signed char, Flags> {
    static __m256 load(const signed char *mem, Flags)
    {
        return _mm256_cvtepi32_ps(
            cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))));
    }
};
*/

// shifted{{{1
// Compile-time byte shift of a 32-byte register. `amount` is a byte count; the
// four overloads cover amount >= 16, 0 < amount < 16, amount <= -16 and
// -16 < amount < 0. Positive and negative amounts shift in opposite directions.

// amount >= 16: only the upper 128-bit half contributes. It is shifted by the
// remaining (amount - 16) bytes and zero-extended back to 256 bits.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
{
    const auto upperHalf = AVX::hi128(AVX::avx_cast<__m256i>(k));
    const auto moved = _mm_srli_si128(upperHalf, amount - 16);
    return AVX::avx_cast<T>(AVX::zeroExtend(moved));
}
// 0 < amount < 16: alignr combines the register with a permute128<X1, Const0>
// copy of itself.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
shifted(T k)
{
    const auto bits = AVX::avx_cast<__m256i>(k);
    const auto carry = Mem::permute128<X1, Const0>(bits);
    return AVX::avx_cast<T>(AVX::alignr<amount>(carry, bits));
}
// amount <= -16: only the lower 128-bit half contributes. It is shifted by the
// remaining (-16 - amount) bytes and placed via permute128<Const0, X0>.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
{
    const auto lowerHalf = AVX::lo128(AVX::avx_cast<__m256i>(k));
    const auto moved = AVX::avx_cast<__m256i>(_mm_slli_si128(lowerHalf, -16 - amount));
    return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(moved));
}
// -16 < amount < 0: alignr combines the register with a
// permute128<Const0, X0> copy of itself.
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
shifted(T k)
{
    const auto bits = AVX::avx_cast<__m256i>(k);
    const auto carry = Mem::permute128<Const0, X0>(bits);
    return AVX::avx_cast<T>(AVX::alignr<16 + amount>(bits, carry));
}
// mask_cast{{{1
// Primary mask_cast: only valid for equal element counts and an __m256 result,
// in which case the mask bits are reinterpreted without any shuffling. Every
// other combination has to be provided as an explicit specialization.
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
{
    static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
    static_assert(From == To, "Incorrect mask cast.");
    const __m256 reinterpreted = AVX::avx_cast<__m256>(k);
    return reinterpreted;
}

// 4 -> 4
// 4 entries in 256 bits -> 4 entries in 128 bits: the two halves are packed
// with signed saturation from 32-bit lanes.
template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
{
    const __m128i lower = AVX::lo128(k);
    const __m128i upper = AVX::hi128(k);
    return AVX::avx_cast<__m128>(_mm_packs_epi32(lower, upper));
}

// 4 entries in 128 bits -> 4 entries in 256 bits: every 32-bit lane is
// duplicated.
template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
{
    const auto kk = _mm_castsi128_ps(k);
    const auto lowerPairs = _mm_unpacklo_ps(kk, kk);
    const auto upperPairs = _mm_unpackhi_ps(kk, kk);
    return AVX::concat(lowerPairs, upperPairs);
}

// 4 -> 8
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
{
    // aabb ccdd -> abcd 0000
    const __m128i packed = _mm_packs_epi32(AVX::lo128(k), AVX::hi128(k));
    const auto widened = AVX::concat(packed, _mm_setzero_si128());
    return AVX::avx_cast<__m256>(widened);
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
{
    // aaaa bbbb cccc dddd -> abcd 0000
    const __m128i packed32 = _mm_packs_epi32(AVX::lo128(k), AVX::hi128(k));
    const __m128i packed16 = _mm_packs_epi16(packed32, _mm_setzero_si128());
    return AVX::avx_cast<__m128>(packed16);
}

// 128-bit input: the four entries are kept as they are and the upper half of
// the result is zero-filled.
template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
{
    const auto asFloat = AVX::avx_cast<__m128>(k);
    return AVX::zeroExtend(asFloat);
}

// 4 -> 16
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
{
    // aaaa bbbb cccc dddd -> abcd 0000 0000 0000
    const __m128 narrow = mask_cast<4, 8, __m128>(k);
    return AVX::zeroExtend(narrow);
}

// 8 -> 4
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
{
    // aabb ccdd eeff gghh -> aaaa bbbb cccc dddd
    // Only the lower four entries survive; each one is duplicated.
    const auto lower = AVX::lo128(AVX::avx_cast<__m256>(k));
    const auto firstTwo = _mm_unpacklo_ps(lower, lower);
    const auto lastTwo = _mm_unpackhi_ps(lower, lower);
    return AVX::concat(firstTwo, lastTwo);
}

// Only the lower 128 bits are kept.
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
{
    const __m128i lower = AVX::lo128(k);
    return AVX::avx_cast<__m128>(lower);
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
{
    // abcd efgh -> aaaa bbbb cccc dddd
    const auto doubled = _mm_unpacklo_epi16(k, k);              // aa bb cc dd
    const auto firstTwo = _mm_unpacklo_epi32(doubled, doubled); // aaaa bbbb
    const auto lastTwo = _mm_unpackhi_epi32(doubled, doubled);  // cccc dddd
    return AVX::avx_cast<__m256>(AVX::concat(firstTwo, lastTwo));
}

// 8 -> 8
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
{
    // aabb ccdd eeff gghh -> abcd efgh
    const __m128i lower = AVX::lo128(k);
    const __m128i upper = AVX::hi128(k);
    return AVX::avx_cast<__m128>(_mm_packs_epi16(lower, upper));
}

// abcd efgh -> aabb ccdd eeff gghh (every 16-bit lane is duplicated)
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
{
    const auto lowerPairs = _mm_unpacklo_epi16(k, k);
    const auto upperPairs = _mm_unpackhi_epi16(k, k);
    return AVX::avx_cast<__m256>(AVX::concat(lowerPairs, upperPairs));
}

// 8 -> 16
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
{
    // aabb ccdd eeff gghh -> abcd efgh 0000 0000
    const __m128 narrow = mask_cast<8, 8, __m128>(k);
    return AVX::zeroExtend(narrow);
}

// 16 -> 8
#ifdef Vc_IMPL_AVX2
// Only provided for AVX2 builds.
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
{
    // abcd efgh ijkl mnop -> aabb ccdd eeff gghh
    // Swap the two middle 64-bit blocks first so that the per-128-bit-lane
    // unpack below picks up "abcd" and "efgh".
    const auto reordered = Mem::permute4x64<X0, X2, X1, X3>(k);
    const auto doubled = AVX::unpacklo_epi16(reordered, reordered);
    return _mm256_castsi256_ps(doubled);
}
#endif

// 16 -> 4
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
{
    // abcd efgh ijkl mnop -> aaaa bbbb cccc dddd
    const auto lower = AVX::lo128(k);
    const auto doubled = _mm_unpacklo_epi16(lower, lower); // aabb ccdd
    const auto firstTwo = _mm_unpacklo_epi32(doubled, doubled);
    const auto lastTwo = _mm_unpackhi_epi32(doubled, doubled);
    return _mm256_castsi256_ps(AVX::concat(firstTwo, lastTwo));
}

// allone{{{1
// allone<V>() specializations for the three 256-bit register types; each one
// forwards to the matching AVX::setallone_* helper.
template<> Vc_INTRINSIC Vc_CONST __m256  allone<__m256 >()
{
    return AVX::setallone_ps();
}
template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>()
{
    return AVX::setallone_si256();
}
template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>()
{
    return AVX::setallone_pd();
}

// zero{{{1
// zero<V>() specializations for the three 256-bit register types: an all-zero
// register of the requested type.
template<> Vc_INTRINSIC Vc_CONST __m256  zero<__m256 >()
{
    return _mm256_setzero_ps();
}
template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>()
{
    return _mm256_setzero_si256();
}
template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>()
{
    return _mm256_setzero_pd();
}

// one{{{1
// one(tag): the (unnamed) argument only selects the element type; each
// overload forwards to the matching AVX::setone_* helper.
Vc_INTRINSIC Vc_CONST __m256 one(float)
{
    return AVX::setone_ps();
}
Vc_INTRINSIC Vc_CONST __m256d one(double)
{
    return AVX::setone_pd();
}
Vc_INTRINSIC Vc_CONST __m256i one(int)
{
    return AVX::setone_epi32();
}
Vc_INTRINSIC Vc_CONST __m256i one(uint)
{
    return AVX::setone_epu32();
}
Vc_INTRINSIC Vc_CONST __m256i one(short)
{
    return AVX::setone_epi16();
}
Vc_INTRINSIC Vc_CONST __m256i one(ushort)
{
    return AVX::setone_epu16();
}
Vc_INTRINSIC Vc_CONST __m256i one(schar)
{
    return AVX::setone_epi8();
}
Vc_INTRINSIC Vc_CONST __m256i one(uchar)
{
    return AVX::setone_epu8();
}

// negate{{{1
// negate(v, integral_constant<size_t, N>): the second argument carries the
// element size in bytes and selects the overload.

// float: flip the sign bits via XOR with the sign mask.
Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
{
    const auto signMask = AVX::setsignmask_ps();
    return _mm256_xor_ps(v, signMask);
}
// double: flip the sign bits via XOR with the sign mask.
Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
{
    const auto signMask = AVX::setsignmask_pd();
    return _mm256_xor_pd(v, signMask);
}
// 32-bit integers: sign_epi32 with an all-ones (i.e. -1) second operand.
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
{
    const __m256i minusOne = Detail::allone<__m256i>();
    return AVX::sign_epi32(v, minusOne);
}
// 16-bit integers: sign_epi16 with an all-ones (i.e. -1) second operand.
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
{
    const __m256i minusOne = Detail::allone<__m256i>();
    return AVX::sign_epi16(v, minusOne);
}

// xor_{{{1
// Bitwise XOR for the three 256-bit register types.
Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b)
{
    return _mm256_xor_ps(a, b);
}
Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b)
{
    return _mm256_xor_pd(a, b);
}
Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_xor_si256(a, b);
#else
    // Without AVX2 the operation is routed through the float instruction.
    const __m256 lhs = _mm256_castsi256_ps(a);
    const __m256 rhs = _mm256_castsi256_ps(b);
    return _mm256_castps_si256(_mm256_xor_ps(lhs, rhs));
#endif
}

// or_{{{1
// Bitwise OR for the three 256-bit register types.
Vc_INTRINSIC __m256 or_(__m256 a, __m256 b)
{
    return _mm256_or_ps(a, b);
}
Vc_INTRINSIC __m256d or_(__m256d a, __m256d b)
{
    return _mm256_or_pd(a, b);
}
Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_or_si256(a, b);
#else
    // Without AVX2 the operation is routed through the float instruction.
    const __m256 lhs = _mm256_castsi256_ps(a);
    const __m256 rhs = _mm256_castsi256_ps(b);
    return _mm256_castps_si256(_mm256_or_ps(lhs, rhs));
#endif
}

// and_{{{1
// Bitwise AND for the three 256-bit register types.
Vc_INTRINSIC __m256 and_(__m256 a, __m256 b)
{
    return _mm256_and_ps(a, b);
}
Vc_INTRINSIC __m256d and_(__m256d a, __m256d b)
{
    return _mm256_and_pd(a, b);
}
Vc_INTRINSIC __m256i and_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_and_si256(a, b);
#else
    // Without AVX2 the operation is routed through the float instruction.
    const __m256 lhs = _mm256_castsi256_ps(a);
    const __m256 rhs = _mm256_castsi256_ps(b);
    return _mm256_castps_si256(_mm256_and_ps(lhs, rhs));
#endif
}

// andnot_{{{1
// Bitwise AND-NOT for the three 256-bit register types; the arguments are
// passed to the andnot intrinsics in the given order (first operand is the one
// that gets complemented by the instruction).
Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b)
{
    return _mm256_andnot_ps(a, b);
}
Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b)
{
    return _mm256_andnot_pd(a, b);
}
Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_andnot_si256(a, b);
#else
    // Without AVX2 the operation is routed through the float instruction.
    const __m256 lhs = _mm256_castsi256_ps(a);
    const __m256 rhs = _mm256_castsi256_ps(b);
    return _mm256_castps_si256(_mm256_andnot_ps(lhs, rhs));
#endif
}

// not_{{{1
// Bitwise complement, expressed as andnot_ against an all-ones register.
Vc_INTRINSIC __m256 not_(__m256 a)
{
    const __m256 ones = allone<__m256>();
    return andnot_(a, ones);
}
Vc_INTRINSIC __m256d not_(__m256d a)
{
    const __m256d ones = allone<__m256d>();
    return andnot_(a, ones);
}
Vc_INTRINSIC __m256i not_(__m256i a)
{
    const __m256i ones = allone<__m256i>();
    return andnot_(a, ones);
}

// blend{{{1
// blend(a, b, c): variable blend of a and b under control of mask c; the
// arguments are handed to the blendv operations in that order.
Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c)
{
    return _mm256_blendv_ps(a, b, c);
}
Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c)
{
    return _mm256_blendv_pd(a, b, c);
}
Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c)
{
    return AVX::blendv_epi8(a, b, c);
}

// abs{{{1
// abs(a, tag): the (unnamed) second argument selects the element type.
// Floating-point: mask the value with the abs mask.
Vc_INTRINSIC __m256 abs(__m256 a, float)
{
    return and_(a, AVX::setabsmask_ps());
}
Vc_INTRINSIC __m256d abs(__m256d a, double)
{
    return and_(a, AVX::setabsmask_pd());
}
// Signed integers: forward to the matching AVX::abs_* helper.
// Unsigned integers: the input is returned unchanged.
Vc_INTRINSIC __m256i abs(__m256i a, int)
{
    return AVX::abs_epi32(a);
}
Vc_INTRINSIC __m256i abs(__m256i a, uint)
{
    return a;
}
Vc_INTRINSIC __m256i abs(__m256i a, short)
{
    return AVX::abs_epi16(a);
}
Vc_INTRINSIC __m256i abs(__m256i a, ushort)
{
    return a;
}
Vc_INTRINSIC __m256i abs(__m256i a, schar)
{
    return AVX::abs_epi8(a);
}
Vc_INTRINSIC __m256i abs(__m256i a, uchar)
{
    return a;
}

// add{{{1
// Element-wise addition; the (unnamed) third argument selects the element
// type. Signed and unsigned integers of equal width share one operation.
Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float)
{
    return _mm256_add_ps(a, b);
}
Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double)
{
    return _mm256_add_pd(a, b);
}
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int)
{
    return AVX::add_epi32(a, b);
}
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint)
{
    return AVX::add_epi32(a, b);
}
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short)
{
    return AVX::add_epi16(a, b);
}
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort)
{
    return AVX::add_epi16(a, b);
}

// sub{{{1
// Element-wise subtraction (a - b); the (unnamed) third argument selects the
// element type. Signed and unsigned integers of equal width share one
// operation.
Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float)
{
    return _mm256_sub_ps(a, b);
}
Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double)
{
    return _mm256_sub_pd(a, b);
}
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int)
{
    return AVX::sub_epi32(a, b);
}
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint)
{
    return AVX::sub_epi32(a, b);
}
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short)
{
    return AVX::sub_epi16(a, b);
}
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort)
{
    return AVX::sub_epi16(a, b);
}

// mul{{{1
// Element-wise multiplication; the (unnamed) third argument selects the
// element type. Integer overloads use the mullo_* helpers; signed and unsigned
// integers of equal width share one operation.
Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float)
{
    return _mm256_mul_ps(a, b);
}
Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double)
{
    return _mm256_mul_pd(a, b);
}
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int)
{
    return AVX::mullo_epi32(a, b);
}
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint)
{
    return AVX::mullo_epi32(a, b);
}
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short)
{
    return AVX::mullo_epi16(a, b);
}
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort)
{
    return AVX::mullo_epi16(a, b);
}

// div{{{1
// Element-wise division (a / b); the (unnamed) third argument selects the
// element type.
Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float)
{
    return _mm256_div_ps(a, b);
}
Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double)
{
    return _mm256_div_pd(a, b);
}
// int: each 128-bit half is converted to double, divided there, and converted
// back to int with truncation.
Vc_INTRINSIC __m256i div(__m256i a, __m256i b,    int) {
    using namespace AVX;
    const __m256d numLo = _mm256_cvtepi32_pd(lo128(a));
    const __m256d denLo = _mm256_cvtepi32_pd(lo128(b));
    const __m256d numHi = _mm256_cvtepi32_pd(hi128(a));
    const __m256d denHi = _mm256_cvtepi32_pd(hi128(b));
    const __m128i quotLo = _mm256_cvttpd_epi32(_mm256_div_pd(numLo, denLo));
    const __m128i quotHi = _mm256_cvttpd_epi32(_mm256_div_pd(numHi, denHi));
    return concat(quotLo, quotHi);
}
Vc_INTRINSIC __m256i div(__m256i a, __m256i b,   uint) {
    // Only the signed int -> double conversion is used here. The inputs are
    // therefore biased by 2^31 before the conversion and the bias is undone
    // afterwards in double precision.
    // One could argue that this is unimportant for b, since dividing by
    // b >= 2^31 is hardly useful, but it is required for full correctness.
    using namespace AVX;
    const __m256i biasedA = add_epi32(a, set1_epi32(-2147483648));
    const __m256i biasedB = add_epi32(b, set1_epi32(-2147483648));
    const __m256d numLo = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(biasedA)), set1_pd(2147483648.));
    const __m256d numHi = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(biasedA)), set1_pd(2147483648.));
    const __m256d denLo = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(biasedB)), set1_pd(2147483648.));
    const __m256d denHi = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(biasedB)), set1_pd(2147483648.));
    const __m256i quotient = concat(_mm256_cvttpd_epi32(_mm256_div_pd(numLo, denLo)),
                                    _mm256_cvttpd_epi32(_mm256_div_pd(numHi, denHi)));
    // Remaining problem: a >= 2^31 and b == 1. In that case the conversion
    // above would return 2^31, so lanes with b == 1 simply take a.
    const __m256i divisorIsOne = cmpeq_epi32(b, setone_epi32());
    return avx_cast<__m256i>(_mm256_blendv_ps(avx_cast<__m256>(quotient),
                                              avx_cast<__m256>(a),
                                              avx_cast<__m256>(divisorIsOne)));
}
// short: each 128-bit half is converted to float, divided there, and converted
// back to short.
Vc_INTRINSIC __m256i div(__m256i a, __m256i b,  short) {
    using namespace AVX;
    const __m256 numLo = convert<short, float>(lo128(a));
    const __m256 denLo = convert<short, float>(lo128(b));
    const __m256 quotLo = _mm256_div_ps(numLo, denLo);
    const __m256 numHi = convert<short, float>(hi128(a));
    const __m256 denHi = convert<short, float>(hi128(b));
    const __m256 quotHi = _mm256_div_ps(numHi, denHi);
    return concat(convert<float, short>(quotLo), convert<float, short>(quotHi));
}

// horizontal add{{{1
// Horizontal reductions over a 256-bit register. Each one first combines the
// lower and upper 128-bit halves element-wise and then hands the 128-bit
// result to the two-argument overload of the same name to finish the reduction
// to a scalar of type T.
template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    const auto halves = add(AVX::lo128(a), AVX::hi128(a), T());
    return {add(halves, T())};
}

// horizontal mul{{{1
template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    const auto halves = mul(AVX::lo128(a), AVX::hi128(a), T());
    return {mul(halves, T())};
}

// horizontal min{{{1
template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    const auto halves = min(AVX::lo128(a), AVX::hi128(a), T());
    return {min(halves, T())};
}

// horizontal max{{{1
template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    const auto halves = max(AVX::lo128(a), AVX::hi128(a), T());
    return {max(halves, T())};
}
// cmpeq{{{1
// Element-wise equality comparison; the (unnamed) third argument selects the
// element type. Signed and unsigned integers of equal width share one
// operation.
//
// The schar/uchar overloads complete the set so that it matches cmpneq, cmpgt,
// cmpge, cmple and cmplt. Without them a call with an 8-bit tag would be
// resolved via integral promotion to the `int` overload and compare 32-bit
// lanes instead of bytes.
Vc_INTRINSIC __m256  cmpeq(__m256  a, __m256  b,  float) { return AvxIntrinsics::cmpeq_ps(a, b); }
Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,    int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,   uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  schar) { return AvxIntrinsics::cmpeq_epi8 (a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  uchar) { return AvxIntrinsics::cmpeq_epi8 (a, b); }

// cmpneq{{{1
// Element-wise inequality comparison; the (unnamed) third argument selects the
// element type. The integer overloads are the bitwise complement of the
// matching equality comparison.
Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float)
{
    return AvxIntrinsics::cmpneq_ps(a, b);
}
Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double)
{
    return AvxIntrinsics::cmpneq_pd(a, b);
}
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int)
{
    return not_(AvxIntrinsics::cmpeq_epi32(a, b));
}
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint)
{
    return not_(AvxIntrinsics::cmpeq_epi32(a, b));
}
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short)
{
    return not_(AvxIntrinsics::cmpeq_epi16(a, b));
}
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort)
{
    return not_(AvxIntrinsics::cmpeq_epi16(a, b));
}
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar)
{
    return not_(AvxIntrinsics::cmpeq_epi8(a, b));
}
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar)
{
    return not_(AvxIntrinsics::cmpeq_epi8(a, b));
}

// cmpgt{{{1
// Element-wise a > b; the (unnamed) third argument selects the element type.
// Unsigned element types use the dedicated cmpgt_epu* helpers.
Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float)
{
    return AVX::cmpgt_ps(a, b);
}
Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double)
{
    return AVX::cmpgt_pd(a, b);
}
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int)
{
    return AVX::cmpgt_epi32(a, b);
}
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint)
{
    return AVX::cmpgt_epu32(a, b);
}
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short)
{
    return AVX::cmpgt_epi16(a, b);
}
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort)
{
    return AVX::cmpgt_epu16(a, b);
}
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar)
{
    return AVX::cmpgt_epi8(a, b);
}
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar)
{
    return AVX::cmpgt_epu8(a, b);
}

// cmpge{{{1
// Element-wise a >= b; the (unnamed) third argument selects the element type.
// The integer overloads are computed as !(b > a).
Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float)
{
    return AVX::cmpge_ps(a, b);
}
Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double)
{
    return AVX::cmpge_pd(a, b);
}
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int)
{
    return not_(AVX::cmpgt_epi32(b, a));
}
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint)
{
    return not_(AVX::cmpgt_epu32(b, a));
}
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short)
{
    return not_(AVX::cmpgt_epi16(b, a));
}
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort)
{
    return not_(AVX::cmpgt_epu16(b, a));
}
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar)
{
    return not_(AVX::cmpgt_epi8(b, a));
}
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar)
{
    return not_(AVX::cmpgt_epu8(b, a));
}

// cmple{{{1
// Element-wise a <= b; the (unnamed) third argument selects the element type.
// The integer overloads are computed as !(a > b).
Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float)
{
    return AVX::cmple_ps(a, b);
}
Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double)
{
    return AVX::cmple_pd(a, b);
}
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int)
{
    return not_(AVX::cmpgt_epi32(a, b));
}
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint)
{
    return not_(AVX::cmpgt_epu32(a, b));
}
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short)
{
    return not_(AVX::cmpgt_epi16(a, b));
}
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort)
{
    return not_(AVX::cmpgt_epu16(a, b));
}
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar)
{
    return not_(AVX::cmpgt_epi8(a, b));
}
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar)
{
    return not_(AVX::cmpgt_epu8(a, b));
}

// cmplt{{{1
// Element-wise a < b; the (unnamed) third argument selects the element type.
// The integer overloads are computed as b > a.
Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float)
{
    return AVX::cmplt_ps(a, b);
}
Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double)
{
    return AVX::cmplt_pd(a, b);
}
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int)
{
    return AVX::cmpgt_epi32(b, a);
}
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint)
{
    return AVX::cmpgt_epu32(b, a);
}
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short)
{
    return AVX::cmpgt_epi16(b, a);
}
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort)
{
    return AVX::cmpgt_epu16(b, a);
}
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar)
{
    return AVX::cmpgt_epi8(b, a);
}
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar)
{
    return AVX::cmpgt_epu8(b, a);
}

// fma{{{1
// fma(a, b, c, tag) computes a * b + c; the (unnamed) last argument selects
// the element type.

// float: uses the FMA4 or FMA3 instruction when the build enables one of them.
// Otherwise the operands are widened to double, multiplied and added there,
// and the result is narrowed back to float.
Vc_INTRINSIC __m256 fma(__m256  a, __m256  b, __m256  c,  float) {
#ifdef Vc_IMPL_FMA4
    return _mm256_macc_ps(a, b, c);
#elif defined Vc_IMPL_FMA
    return _mm256_fmadd_ps(a, b, c);
#else
    using namespace AVX;
    const __m256d aLo = _mm256_cvtps_pd(lo128(a));
    const __m256d aHi = _mm256_cvtps_pd(hi128(a));
    const __m256d bLo = _mm256_cvtps_pd(lo128(b));
    const __m256d bHi = _mm256_cvtps_pd(hi128(b));
    const __m256d cLo = _mm256_cvtps_pd(lo128(c));
    const __m256d cHi = _mm256_cvtps_pd(hi128(c));
    const __m256d sumLo = _mm256_add_pd(_mm256_mul_pd(aLo, bLo), cLo);
    const __m256d sumHi = _mm256_add_pd(_mm256_mul_pd(aHi, bHi), cHi);
    return concat(_mm256_cvtpd_ps(sumLo), _mm256_cvtpd_ps(sumHi));
#endif
}
// double: uses the FMA4 or FMA3 instruction when the build enables one of
// them. Otherwise both factors are split into a "high" part (selected by
// c_general::highMaskDouble) and the remaining "low" part, the partial
// products are formed separately, and everything is summed up together with c.
Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
{
#ifdef Vc_IMPL_FMA4
    return _mm256_macc_pd(a, b, c);
#elif defined Vc_IMPL_FMA
    return _mm256_fmadd_pd(a, b, c);
#else
    using namespace AVX;
    const __m256d aHigh = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
                                      &c_general::highMaskDouble)));
    const __m256d bHigh = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
                                      &c_general::highMaskDouble)));
    const __m256d aLow = _mm256_sub_pd(a, aHigh);
    const __m256d bLow = _mm256_sub_pd(b, bHigh);
    const __m256d lowLow = mul(aLow, bLow, double());
    const __m256d lowHigh =
        add(mul(aLow, bHigh, double()), mul(aHigh, bLow, double()), double());
    const __m256d highHigh = mul(aHigh, bHigh, double());
    // lowLow < lowHigh < highHigh for all entries is certain
    // Order lowHigh and c by magnitude: `smaller` gets the one with the
    // smaller absolute value, `larger` the other one.
    const __m256d lowHighIsSmaller =
        cmplt(abs(lowHigh, double()), abs(c, double()), double());  // |lowHigh| < |c|
    const __m256d smaller = _mm256_blendv_pd(c, lowHigh, lowHighIsSmaller);
    const __m256d larger = _mm256_blendv_pd(lowHigh, c, lowHighIsSmaller);
    return add(add(lowLow, smaller, double()), add(larger, highHigh, double()), double());
#endif
}
// integers: a plain multiply followed by an add, using the mul/add overloads
// for element type T.
template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
{
    const __m256i product = mul(a, b, T());
    return add(product, c, T());
}

// shiftRight{{{1
// Per-element right shift; the (unnamed) last argument selects the element
// type. Signed types use the arithmetic (sra*) shifts, unsigned types the
// logical (srl*) shifts.

// Shift count known at compile time.
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, int)
{
    return AVX::srai_epi32<shift>(a);
}
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, uint)
{
    return AVX::srli_epi32<shift>(a);
}
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, short)
{
    return AVX::srai_epi16<shift>(a);
}
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort)
{
    return AVX::srli_epi16<shift>(a);
}
//template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,  schar) { return AVX::srai_epi8 <shift>(a); }
//template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,  uchar) { return AVX::srli_epi8 <shift>(a); }

// Shift count known at run time; it is moved into an SSE register first.
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::sra_epi32(a, count);
}
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::srl_epi32(a, count);
}
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::sra_epi16(a, count);
}
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::srl_epi16(a, count);
}
//Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,  schar) { return AVX::sra_epi8 (a, _mm_cvtsi32_si128(shift)); }
//Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,  uchar) { return AVX::srl_epi8 (a, _mm_cvtsi32_si128(shift)); }

// shiftLeft{{{1
// Per-element left shift; the (unnamed) last argument selects the element
// type. Signed and unsigned integers of equal width share one operation.

// Shift count known at compile time.
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, int)
{
    return AVX::slli_epi32<shift>(a);
}
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint)
{
    return AVX::slli_epi32<shift>(a);
}
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, short)
{
    return AVX::slli_epi16<shift>(a);
}
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort)
{
    return AVX::slli_epi16<shift>(a);
}
//template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,  schar) { return AVX::slli_epi8 <shift>(a); }
//template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,  uchar) { return AVX::slli_epi8 <shift>(a); }

// Shift count known at run time; it is moved into an SSE register first.
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::sll_epi32(a, count);
}
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::sll_epi32(a, count);
}
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::sll_epi16(a, count);
}
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort)
{
    const __m128i count = _mm_cvtsi32_si128(shift);
    return AVX::sll_epi16(a, count);
}
//Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,  schar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); }
//Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,  uchar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); }

// zeroExtendIfNeeded{{{1
// 256-bit arguments are already full width and are returned unchanged.
Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x)
{
    return x;
}
Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x)
{
    return x;
}
Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x)
{
    return x;
}

// 128-bit arguments are widened to the matching 256-bit type via AVX::zeroExtend.
Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x)
{
    return AVX::zeroExtend(x);
}
Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x)
{
    return AVX::zeroExtend(x);
}
Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x)
{
    return AVX::zeroExtend(x);
}

// broadcast{{{1
// Replicates a scalar into every element of a 256-bit register. The overload
// (i.e. the scalar's type) selects the _mm256_set1_* intrinsic that is used.

// floating point
Vc_INTRINSIC __m256 avx_broadcast(float x)
{
    return _mm256_set1_ps(x);
}
Vc_INTRINSIC __m256d avx_broadcast(double x)
{
    return _mm256_set1_pd(x);
}

// 32-bit integers
Vc_INTRINSIC __m256i avx_broadcast(int x)
{
    return _mm256_set1_epi32(x);
}
Vc_INTRINSIC __m256i avx_broadcast(uint x)
{
    return _mm256_set1_epi32(x);
}

// 16-bit integers
Vc_INTRINSIC __m256i avx_broadcast(short x)
{
    return _mm256_set1_epi16(x);
}
Vc_INTRINSIC __m256i avx_broadcast(ushort x)
{
    return _mm256_set1_epi16(x);
}

// 8-bit integers
Vc_INTRINSIC __m256i avx_broadcast(char x)
{
    return _mm256_set1_epi8(x);
}
Vc_INTRINSIC __m256i avx_broadcast(schar x)
{
    return _mm256_set1_epi8(x);
}
Vc_INTRINSIC __m256i avx_broadcast(uchar x)
{
    return _mm256_set1_epi8(x);
}

// sorted{{{1
// Implementation-specific overload (declaration only here). It participates in
// overload resolution only for Impl in the range [AVXImpl, AVX2Impl].
template <Vc::Implementation Impl, typename T,
          typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
Vc_CONST_L AVX2::Vector<T> sorted(AVX2::Vector<T> x) Vc_CONST_R;

// Convenience overload: forwards to the overload above, instantiated for the
// implementation reported by CurrentImplementation::current().
template <typename T>
Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
{
    constexpr auto active_impl = CurrentImplementation::current();
    return sorted<active_impl>(x);
}

// shifted{{{1
// Shifts the 256-bit vector `v` by a run-time `amount`, counted in elements of
// type T. The run-time value is mapped onto the compile-time overload
// shifted<Bytes>(v) through switch statements; each byte count (amount * S with
// S = sizeof(T)) is passed through sanitize<V>() before it becomes the template
// argument.
//
// Which amounts are dispatched depends on sizeof(T):
//   always          :  -3 ...  3
//   sizeof(T) <= 4  : additionally +-4  ... +-7
//   sizeof(T) <= 2  : additionally +-8  ... +-15
//   sizeof(T) == 1  : additionally +-16 ... +-31
// Every other amount falls through to the end and yields an all-zero vector.
template <typename T, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
{
    using namespace AVX;
    constexpr int S = sizeof(T);  // element size in bytes
    // amounts handled for every element size
    switch (amount) {
    case  0: return v;
    case  1: return shifted<sanitize<V>( 1 * S)>(v);
    case  2: return shifted<sanitize<V>( 2 * S)>(v);
    case  3: return shifted<sanitize<V>( 3 * S)>(v);
    case -1: return shifted<sanitize<V>(-1 * S)>(v);
    case -2: return shifted<sanitize<V>(-2 * S)>(v);
    case -3: return shifted<sanitize<V>(-3 * S)>(v);
    }
    // elements of at most 4 bytes: a 32-byte vector holds at least 8 of them
    if (sizeof(T) <= 4) {
        switch (amount) {
        case  4: return shifted<sanitize<V>( 4 * S)>(v);
        case  5: return shifted<sanitize<V>( 5 * S)>(v);
        case  6: return shifted<sanitize<V>( 6 * S)>(v);
        case  7: return shifted<sanitize<V>( 7 * S)>(v);
        case -4: return shifted<sanitize<V>(-4 * S)>(v);
        case -5: return shifted<sanitize<V>(-5 * S)>(v);
        case -6: return shifted<sanitize<V>(-6 * S)>(v);
        case -7: return shifted<sanitize<V>(-7 * S)>(v);
        }
        // elements of at most 2 bytes: at least 16 per vector
        if (sizeof(T) <= 2) {
            switch (amount) {
            case   8: return shifted<sanitize<V>(  8 * S)>(v);
            case   9: return shifted<sanitize<V>(  9 * S)>(v);
            case  10: return shifted<sanitize<V>( 10 * S)>(v);
            case  11: return shifted<sanitize<V>( 11 * S)>(v);
            case  12: return shifted<sanitize<V>( 12 * S)>(v);
            case  13: return shifted<sanitize<V>( 13 * S)>(v);
            case  14: return shifted<sanitize<V>( 14 * S)>(v);
            case  15: return shifted<sanitize<V>( 15 * S)>(v);
            case  -8: return shifted<sanitize<V>(- 8 * S)>(v);
            case  -9: return shifted<sanitize<V>(- 9 * S)>(v);
            case -10: return shifted<sanitize<V>(-10 * S)>(v);
            case -11: return shifted<sanitize<V>(-11 * S)>(v);
            case -12: return shifted<sanitize<V>(-12 * S)>(v);
            case -13: return shifted<sanitize<V>(-13 * S)>(v);
            case -14: return shifted<sanitize<V>(-14 * S)>(v);
            case -15: return shifted<sanitize<V>(-15 * S)>(v);
            }
            // single-byte elements: S == 1, so the byte count equals the amount
            // and the multiplication by S is omitted below
            if (sizeof(T) == 1) {
                switch (amount) {
                case  16: return shifted<sanitize<V>( 16)>(v);
                case  17: return shifted<sanitize<V>( 17)>(v);
                case  18: return shifted<sanitize<V>( 18)>(v);
                case  19: return shifted<sanitize<V>( 19)>(v);
                case  20: return shifted<sanitize<V>( 20)>(v);
                case  21: return shifted<sanitize<V>( 21)>(v);
                case  22: return shifted<sanitize<V>( 22)>(v);
                case  23: return shifted<sanitize<V>( 23)>(v);
                case  24: return shifted<sanitize<V>( 24)>(v);
                case  25: return shifted<sanitize<V>( 25)>(v);
                case  26: return shifted<sanitize<V>( 26)>(v);
                case  27: return shifted<sanitize<V>( 27)>(v);
                case  28: return shifted<sanitize<V>( 28)>(v);
                case  29: return shifted<sanitize<V>( 29)>(v);
                case  30: return shifted<sanitize<V>( 30)>(v);
                case  31: return shifted<sanitize<V>( 31)>(v);
                case -16: return shifted<sanitize<V>(-16)>(v);
                case -17: return shifted<sanitize<V>(-17)>(v);
                case -18: return shifted<sanitize<V>(-18)>(v);
                case -19: return shifted<sanitize<V>(-19)>(v);
                case -20: return shifted<sanitize<V>(-20)>(v);
                case -21: return shifted<sanitize<V>(-21)>(v);
                case -22: return shifted<sanitize<V>(-22)>(v);
                case -23: return shifted<sanitize<V>(-23)>(v);
                case -24: return shifted<sanitize<V>(-24)>(v);
                case -25: return shifted<sanitize<V>(-25)>(v);
                case -26: return shifted<sanitize<V>(-26)>(v);
                case -27: return shifted<sanitize<V>(-27)>(v);
                case -28: return shifted<sanitize<V>(-28)>(v);
                case -29: return shifted<sanitize<V>(-29)>(v);
                case -30: return shifted<sanitize<V>(-30)>(v);
                case -31: return shifted<sanitize<V>(-31)>(v);
                }
            }
        }
    }
    // amount not covered by any case above: result is all zeros
    return avx_cast<V>(_mm256_setzero_ps());
}

// Shifts the 128-bit vector `v` by a run-time `amount`, counted in elements of
// type T. Positive amounts use _mm_srli_si128, negative amounts _mm_slli_si128,
// each with a byte count of |amount| * sizeof(T) passed through sanitize<V>().
// Amounts -3..3 are always handled; +-4..+-7 only if sizeof(T) <= 2. Any other
// amount yields an all-zero vector.
template <typename T, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
{
    using namespace AVX;
    // Reinterpret once; every non-trivial case shifts these same 128 bits.
    const __m128i bits = avx_cast<__m128i>(v);
    switch (amount) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(1 * sizeof(T))));
    case 2:
        return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(2 * sizeof(T))));
    case 3:
        return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(3 * sizeof(T))));
    case -1:
        return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(1 * sizeof(T))));
    case -2:
        return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(2 * sizeof(T))));
    case -3:
        return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(3 * sizeof(T))));
    }
    if (sizeof(T) <= 2) {
        switch (amount) {
        case 4:
            return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(4 * sizeof(T))));
        case 5:
            return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(5 * sizeof(T))));
        case 6:
            return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(6 * sizeof(T))));
        case 7:
            return avx_cast<V>(_mm_srli_si128(bits, sanitize<V>(7 * sizeof(T))));
        case -4:
            return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(4 * sizeof(T))));
        case -5:
            return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(5 * sizeof(T))));
        case -6:
            return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(6 * sizeof(T))));
        case -7:
            return avx_cast<V>(_mm_slli_si128(bits, sanitize<V>(7 * sizeof(T))));
        }
    }
    // amount outside the handled range
    return avx_cast<V>(_mm_setzero_ps());
}
// rotated{{{1
// Rotates the elements of a 256-bit vector holding N == 4 elements of type T.
// `amount` is converted to unsigned and reduced modulo N, so exactly the
// residues 0..3 are distinguished. Residues 1 and 3 recombine the two 128-bit
// halves with SSE::alignr_epi8 at a byte offset of sizeof(T) (with the operand
// roles of the halves exchanged between the two cases); residue 2 is done with
// Mem::permute128<X1, X0> (presumably exchanging the two 128-bit halves).
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
                                                                               int amount)
{
    using namespace AVX;
    // the two 128-bit halves of v, viewed as integer registers
    const __m128i vLo = avx_cast<__m128i>(lo128(v));
    const __m128i vHi = avx_cast<__m128i>(hi128(v));
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
    case 2:
        return Mem::permute128<X1, X0>(v);
    case 3:
        return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
    }
    // every residue 0..3 returns above; this only gives the function a return
    // value on the fall-through path
    return avx_cast<V>(_mm256_setzero_ps());
}

// Rotates the elements of a 256-bit vector holding N == 8 elements of type T.
// `amount` is converted to unsigned and reduced modulo N. Residues 1..3 combine
// the two 128-bit halves with SSE::alignr_epi8 at byte offsets of 1..3 elements;
// residue 4 uses Mem::permute128<X1, X0> (presumably exchanging the halves);
// residues 5..7 repeat the pattern of 1..3 with the roles of the halves
// exchanged.
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
                                                                               int amount)
{
    using namespace AVX;
    // the two 128-bit halves of v, viewed as integer registers
    const __m128i vLo = avx_cast<__m128i>(lo128(v));
    const __m128i vHi = avx_cast<__m128i>(hi128(v));
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
    case 2:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
    case 3:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
    case 4:
        return Mem::permute128<X1, X0>(v);
    // 5..7: same offsets as 1..3, halves exchanged
    case 5:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
    case 6:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
    case 7:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
    }
    // every residue 0..7 returns above; this only gives the function a return
    // value on the fall-through path
    return avx_cast<V>(_mm256_setzero_ps());
}

#ifdef Vc_IMPL_AVX2
// Rotates the elements of a 256-bit vector holding N == 16 elements of type T.
// `amount` is converted to unsigned and reduced modulo N. Most residues combine
// the two 128-bit halves with SSE::alignr_epi8 at a byte offset of
// (residue % 8) * sizeof(T); residues 9..15 use the same offsets as 1..7 with
// the roles of the halves exchanged. Residues 4 and 12 use Mem::permute4x64 and
// residue 8 uses Mem::permute128<X1, X0> instead of alignr.
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
    V v, int amount)
{
    using namespace AVX;
    // the two 128-bit halves of v, viewed as integer registers
    const __m128i vLo = avx_cast<__m128i>(lo128(v));
    const __m128i vHi = avx_cast<__m128i>(hi128(v));
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
    case 2:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
    case 3:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
    case 4:
        // handled as a permutation of the four 64-bit quarters
        return Mem::permute4x64<X1, X2, X3, X0>(v);
    case 5:
        return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
    case 6:
        return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
    case 7:
        return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
    case 8:
        return Mem::permute128<X1, X0>(v);
    // 9..15: same offsets as 1..7, halves exchanged
    case 9:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
    case 10:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
    case 11:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
    case 12:
        // handled as a permutation of the four 64-bit quarters
        return Mem::permute4x64<X3, X0, X1, X2>(v);
    case 13:
        return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
    case 14:
        return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
    case 15:
        return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
    }
    // every residue 0..15 returns above; this only gives the function a return
    // value on the fall-through path
    return avx_cast<V>(_mm256_setzero_ps());
}
#endif  // Vc_IMPL_AVX2

// testc{{{1
// Thin overload set over the *_testc_* intrinsics. The __m128 overload has no
// float-specific call here: both operands are reinterpreted as __m128i and
// passed to _mm_testc_si128.
Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b)
{
    const __m128i ia = _mm_castps_si128(a);
    const __m128i ib = _mm_castps_si128(b);
    return _mm_testc_si128(ia, ib);
}
Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b)
{
    return _mm256_testc_ps(a, b);
}
Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b)
{
    return _mm256_testc_pd(a, b);
}
Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b)
{
    return _mm256_testc_si256(a, b);
}

// testz{{{1
// Thin overload set over the *_testz_* intrinsics. The __m128 overload
// reinterprets both operands as __m128i and uses _mm_testz_si128.
Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b)
{
    const __m128i ia = _mm_castps_si128(a);
    const __m128i ib = _mm_castps_si128(b);
    return _mm_testz_si128(ia, ib);
}
Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b)
{
    return _mm256_testz_ps(a, b);
}
Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b)
{
    return _mm256_testz_pd(a, b);
}
Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b)
{
    return _mm256_testz_si256(a, b);
}

// testnzc{{{1
// Thin overload set over the *_testnzc_* intrinsics. The __m128 overload
// reinterprets both operands as __m128i and uses _mm_testnzc_si128.
Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b)
{
    const __m128i ia = _mm_castps_si128(a);
    const __m128i ib = _mm_castps_si128(b);
    return _mm_testnzc_si128(ia, ib);
}
Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b)
{
    return _mm256_testnzc_ps(a, b);
}
Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b)
{
    return _mm256_testnzc_pd(a, b);
}
Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b)
{
    return _mm256_testnzc_si256(a, b);
}

// movemask{{{1
// Overload set that picks the movemask intrinsic from the register type:
// integer registers use the per-byte (epi8) variant, double registers the pd
// variant and float registers the ps variant.

// integer registers
Vc_INTRINSIC Vc_CONST int movemask(__m256i a)
{
    return AVX::movemask_epi8(a);
}
Vc_INTRINSIC Vc_CONST int movemask(__m128i a)
{
    return _mm_movemask_epi8(a);
}

// double-precision registers
Vc_INTRINSIC Vc_CONST int movemask(__m256d a)
{
    return _mm256_movemask_pd(a);
}
Vc_INTRINSIC Vc_CONST int movemask(__m128d a)
{
    return _mm_movemask_pd(a);
}

// single-precision registers
Vc_INTRINSIC Vc_CONST int movemask(__m256 a)
{
    return _mm256_movemask_ps(a);
}
Vc_INTRINSIC Vc_CONST int movemask(__m128 a)
{
    return _mm_movemask_ps(a);
}

// mask_store{{{1
// Writes the N entries of the 256-bit mask `k` to `mem` as N bytes, one per
// entry. N selects how wide each mask entry in `k` is taken to be (256 / N
// bits); only N == 4, 8 and 16 are implemented (enforced by the static_assert).
// The Flags argument is only consulted for N == 16, where Flags::IsAligned
// selects between an aligned and an unaligned 16-byte store.
template <size_t N, typename Flags>
Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
{
    static_assert(
        N == 4 || N == 8 || N == 16,
        "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
    switch (N) {
    case 4:
        // The two 16-bit byte-movemasks are joined into 32 bits (8 bits per mask
        // entry); "& 0x01010101" keeps the lowest bit of each byte, so the four
        // bytes written through the int32_t store are each 0 or 1.
        *reinterpret_cast<MayAlias<int32_t> *>(mem) =
            (_mm_movemask_epi8(AVX::lo128(k)) |
             (_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
            0x01010101;
        break;
    case 8: {
        // Pack to eight 16-bit values, reduce each to its top bit (0 or 1), then
        // pack once more so the low 8 bytes of k3 hold the eight results.
        const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
        const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
#ifdef __x86_64__
        // one 64-bit store
        *reinterpret_cast<MayAlias<int64_t> *>(mem) = _mm_cvtsi128_si64(k3);
#else
        // two 32-bit stores covering the same 8 bytes
        *reinterpret_cast<MayAlias<int32_t> *>(mem) = _mm_cvtsi128_si32(k3);
        *reinterpret_cast<MayAlias<int32_t> *>(mem + 4) = _mm_extract_epi32(k3, 1);
#endif
    } break;
    case 16: {
        // Pack the sixteen 16-bit entries to bytes and AND them with
        // AVX::_mm_setone_epu8() (presumably a vector with every byte equal to 1).
        const auto bools = Detail::and_(AVX::_mm_setone_epu8(),
                                        _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
        if (Flags::IsAligned) {
            _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
        } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
        }
    } break;
    default:
        // excluded by the static_assert above
        Vc_UNREACHABLE();
    }
}

// mask_load{{{1
// Loads N bool values from `mem` and expands them into a 128-bit mask returned
// as R == __m128 (this overload is selected only for that R). Only N == 4 and
// N == 8 are implemented (enforced by the static_assert). The Flags argument is
// not used by this overload.
//
//   N == 4: the 4 bytes are loaded as one int32_t; each byte is duplicated to
//           fill a 32-bit element and compared (signed) against zero.
//   N == 8: the 8 bytes are loaded as one 64-bit value; each byte is duplicated
//           to fill a 16-bit element and compared (signed) against zero.
template <typename R, size_t N, typename Flags>
Vc_INTRINSIC R mask_load(const bool *mem, Flags,
                         enable_if<std::is_same<R, __m128>::value> = nullarg)
{
    static_assert(N == 4 || N == 8,
                  "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries");
    switch (N) {
    case 4: {
        __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const MayAlias<int32_t> *>(mem));
        k = _mm_unpacklo_epi8(k, k);    // byte  -> 16 bit (byte repeated twice)
        k = _mm_unpacklo_epi16(k, k);   // 16 bit -> 32 bit (byte repeated four times)
        k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
        return AVX::avx_cast<__m128>(k);
    }
    case 8: {
#ifdef __x86_64__
        __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const MayAlias<int64_t> *>(mem));
#else
        // no 64-bit GPR move available: load the 8 bytes through a double
        __m128i k = _mm_castpd_si128(
            _mm_load_sd(reinterpret_cast<const MayAlias<double> *>(mem)));
#endif
        return AVX::avx_cast<__m128>(
            _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
    }
    default:
        // Excluded by the static_assert above. The explicit return mirrors the
        // __m256 overload of mask_load, so every path of this non-void function
        // ends in a return statement.
        Vc_UNREACHABLE();
        return R();
    }
}

// Loads N bool values from `mem` and expands them into a 256-bit mask returned
// as R == __m256 (this overload is selected only for that R). Only N == 4, 8 and
// 16 are implemented (enforced by the static_assert). Flags::IsAligned is only
// consulted for N == 16, where it selects an aligned or unaligned 16-byte load.
template <typename R, size_t N, typename Flags>
Vc_INTRINSIC R mask_load(const bool *mem, Flags,
                         enable_if<std::is_same<R, __m256>::value> = nullarg)
{
    static_assert(
        N == 4 || N == 8 || N == 16,
        "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
    switch (N) {
    case 4: {
        // Broadcast the 4 bytes (read through a float pointer) to all four
        // 32-bit elements, then let element j keep only bit 0 of byte j.
        __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
            _mm_set1_ps(*reinterpret_cast<const MayAlias<float> *>(mem)),
            AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
        k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
        // duplicate every 32-bit result so that each entry spans 64 bits
        return AVX::avx_cast<__m256>(
            AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
    }
    case 8: {
#ifdef __x86_64__
        __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const MayAlias<int64_t> *>(mem));
#else
        // load the 8 bytes through a double instead of a 64-bit integer
        __m128i k = _mm_castpd_si128(
            _mm_load_sd(reinterpret_cast<const MayAlias<double> *>(mem)));
#endif
        // byte -> 16-bit compare result, then duplicate to 32 bits per entry
        k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
        return AVX::avx_cast<__m256>(
            AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
    }
    case 16: {
        // per-byte compare against zero, then duplicate to 16 bits per entry
        const auto k128 = _mm_cmpgt_epi8(
            Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
                             : _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
            _mm_setzero_si128());
        return AVX::avx_cast<__m256>(
            AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
    }
    default:
        // excluded by the static_assert above
        Vc_UNREACHABLE();
        return R();
    }
}

// mask_to_int{{{1
// Converts a 256-bit mask register into an int; the template argument is the
// number of mask entries. Only the explicit specializations below exist.
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;

// 4 entries: reinterpret as __m256d and use the matching movemask overload.
template <>
Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
{
    const __m256d as_pd = AVX::avx_cast<__m256d>(k);
    return movemask(as_pd);
}

// 8 entries: reinterpret as __m256 and use the matching movemask overload.
template <>
Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
{
    const __m256 as_ps = AVX::avx_cast<__m256>(k);
    return movemask(as_ps);
}

#ifdef Vc_IMPL_BMI2
// 16 entries (only with BMI2): take the integer movemask of k and extract
// every second bit (selector 0x55555555) with _pext_u32.
template <>
Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
{
    const int per_byte = movemask(k);
    return _pext_u32(per_byte, 0x55555555u);
}
#endif

// 32 entries: the integer movemask of k is returned as is.
template <>
Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
{
    const int per_byte = movemask(k);
    return per_byte;
}

//InterleaveImpl{{{1
template<typename V> struct InterleaveImpl<V, 16, 32> {
    // Interleaves two vectors and scatters the result: for each k in 0..15 one
    // 32-bit value (an element pair taken from v0 and v1) is written to
    // &data[i[k]]. The pairs come from a 16-bit unpack of v0 and v1; the low
    // 128-bit halves of tmp0/tmp1 supply indexes 0..7 and the high halves
    // supply indexes 8..15.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ...
            const typename V::AsArg v1) // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ...
    {
        const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ...
        const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ...
        using namespace AVX;
        // indexes 0..3: low half of tmp0
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
        // indexes 4..7: low half of tmp1
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
        // indexes 8..11: high half of tmp0
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
        // indexes 12..15: high half of tmp1
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
        *reinterpret_cast<MayAlias<uint32_t> *>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
    }/*}}}*/
    // Two-vector interleave for Common::SuccessiveEntries<2> indexes: instead of
    // scattering element pairs one by one, the interleaved data is written with
    // two unaligned full-vector stores, at &data[i[0]] and &data[i[8]].
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        // 16-bit unpack of v0/v1, performed per 128-bit half
        const __m256i pairs_lo = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ...
        const __m256i pairs_hi = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ...
        // regroup the 128-bit halves: <X0, Y0> feeds the first store, <X1, Y1> the second
        const __m256i first_block = Mem::shuffle128<X0, Y0>(pairs_lo, pairs_hi);
        const __m256i second_block = Mem::shuffle128<X1, Y1>(pairs_lo, pairs_hi);
        V(first_block).store(&data[i[0]], Vc::Unaligned);
        V(second_block).store(&data[i[8]], Vc::Unaligned);
    }/*}}}*/
    // Three-vector interleave: v0 and v1 are written with the two-vector
    // interleave; v2 is then scattered separately, starting at data + 2, with the
    // same indexes.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
    {
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
    }/*}}}*/
    // Four-vector interleave with arbitrary indexes. Two rounds of 16-bit
    // unpacks turn v0..v3 into groups of four elements (one from each vector,
    // 64 bits per group); each group is then written with one 64-bit store to
    // &data[i[k]] for k in 0..15.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ...
        const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ...
        const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ...
        const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ...

        const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ...
        const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11
        const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13
        const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15

        using namespace AVX;
        // Writes the four 64-bit groups held in x: the low/high 64 bits of the
        // low 128-bit half go to indexes offset+0 / offset+1, those of the high
        // half to indexes offset+8 / offset+9.
        auto &&store = [&](__m256i x, int offset) {
            _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
            _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
            _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
            _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
        };
        store(tmp4, 0);
        store(tmp5, 2);
        store(tmp6, 4);
        store(tmp7, 6);
    }/*}}}*/
    // Four-vector interleave for Common::SuccessiveEntries<4> indexes: the
    // interleaved data is written with four unaligned full-vector stores, at
    // &data[i[0]], &data[i[4]], &data[i[8]] and &data[i[12]].
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        // round 1: pair v0 with v2 and v1 with v3
        const __m256i ac_lo = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ...
        const __m256i ac_hi = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ...
        const __m256i bd_lo = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ...
        const __m256i bd_hi = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ...

        // round 2: merge the pairs into groups of four
        const __m256i grp_0_1 = AVX::unpacklo_epi16(ac_lo, bd_lo); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ...
        const __m256i grp_2_3 = AVX::unpackhi_epi16(ac_lo, bd_lo); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11
        const __m256i grp_4_5 = AVX::unpacklo_epi16(ac_hi, bd_hi); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13
        const __m256i grp_6_7 = AVX::unpackhi_epi16(ac_hi, bd_hi); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15

        // regroup the 128-bit halves and write them out in index order
        V(Mem::shuffle128<X0, Y0>(grp_0_1, grp_2_3)).store(&data[i[0]], ::Vc::Unaligned);
        V(Mem::shuffle128<X0, Y0>(grp_4_5, grp_6_7)).store(&data[i[4]], ::Vc::Unaligned);
        V(Mem::shuffle128<X1, Y1>(grp_0_1, grp_2_3)).store(&data[i[8]], ::Vc::Unaligned);
        V(Mem::shuffle128<X1, Y1>(grp_4_5, grp_6_7)).store(&data[i[12]], ::Vc::Unaligned);
    }/*}}}*/
    // Five-vector interleave: v0..v3 go through the four-vector interleave; v4
    // is then scattered separately, starting at data + 4, with the same indexes.
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    // Six-vector interleave: v0..v3 go through the four-vector interleave at
    // data; v4 and v5 go through the two-vector interleave at data + 4.
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    // Seven-vector interleave: v0..v3 go through the four-vector interleave at
    // data; v4..v6 go through the three-vector interleave at data + 4.
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    // Eight-vector interleave: two four-vector interleaves, v0..v3 at data and
    // v4..v7 at data + 4.
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    // Two-vector deinterleave (gather): reads one 32-bit value (an element pair)
    // from &data[i[k]] for each k in 0..15 and separates the pairs into v0 (first
    // element of every pair) and v1 (second element) using three rounds of 16-bit
    // unpacks. Indexes 0..3 and 8..11 are gathered into tmp4, indexes 4..7 and
    // 12..15 into tmp5, so that the per-128-bit-half unpacks end up in element
    // order.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        const __m256i tmp4 =  // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 b9 a10 b10 a11 b11
            _mm256_setr_epi32(*reinterpret_cast<const MayAlias<int> *>(&data[i[0]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[1]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[2]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[3]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[8]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[9]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[10]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[11]]));
        const __m256i tmp5 = // a4 b4 a5 b5 a6 b6 a7 b7 | a12 b12 a13 b13 a14 b14 a15 b15
            _mm256_setr_epi32(*reinterpret_cast<const MayAlias<int> *>(&data[i[4]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[5]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[6]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[7]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[12]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[13]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[14]]),
                              *reinterpret_cast<const MayAlias<int> *>(&data[i[15]]));

        const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);  // a0 a4 b0 b4 a1 a5 b1 b5 | a8 a12 b8 b12 a9 a13 b9 b13
        const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);  // a2 a6 b2 b6 a3 a7 b3 b7 | a10 a14 b10 b14 a11 a15 b11 b15

        const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);  // a0 a2 a4 a6 b0 b2 b4 b6 | a8 a10 a12 a14 b8 ...
        const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);  // a1 a3 a5 a7 b1 b3 b5 b7 | a9 a11 a13 a15 b9 ...

        v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ...
        v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ...
    }/*}}}*/
    // Three-vector deinterleave (gather): reads 8 bytes (through a double
    // pointer) from &data[i[k]] for each k in 0..15 and separates them into v0,
    // v1 and v2 using three rounds of 16-bit unpacks. The remaining quarter of
    // the gathered data (marked XX in the lane comments) is not used.
    // NOTE(review): each read covers 8 bytes although only three vectors are
    // produced; presumably that is one element more than the three members per
    // index - confirm that callers guarantee the extra bytes are readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        using namespace AVX;
        // gather: tmpN holds indexes 2N, 2N+1 in its low half and 2N+8, 2N+9 in
        // its high half
        const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast<const MayAlias<double> *>(&data[i[0]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[1]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[8]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[9]])));
        const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast<const MayAlias<double> *>(&data[i[2]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[3]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[10]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[11]])));
        const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast<const MayAlias<double> *>(&data[i[4]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[5]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[12]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[13]])));
        const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast<const MayAlias<double> *>(&data[i[6]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[7]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[14]]),
                                                              *reinterpret_cast<const MayAlias<double> *>(&data[i[15]])));
        // unpack round 1
        const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 XX XX | a8 a12 b8 ...
        const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ...
        const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ...
        const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ...

        // unpack round 2
        const __m256i tmp8  = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ...
        const __m256i tmp9  = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 XX ...
        const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ...
        const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 XX ...

        // unpack round 3: the high unpack of tmp9/tmp11 would only hold the
        // unused XX data and is therefore not computed
        v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
        v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
        v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
    }/*}}}*/
    // Deinterleave four members from 16 records. One double-sized (64-bit) chunk is read at
    // each of data[i[0]] ... data[i[15]] and handled as four 16-bit entries (a, b, c, d).
    // After the three unpack stages v0 holds all a entries, v1 all b, v2 all c and v3 all d,
    // with the entry of record k in slot k.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        using namespace AVX;
        // Each record is fetched as a single 64-bit value through a may-alias pointer.
        typedef const MayAlias<double> *Chunk;

        // recNM: records N and M in the low 128-bit lane, records N+8 and M+8 in the high lane.
        const __m256i rec01 = avx_cast<__m256i>(_mm256_setr_pd(
            *reinterpret_cast<Chunk>(&data[i[0]]), *reinterpret_cast<Chunk>(&data[i[1]]),
            *reinterpret_cast<Chunk>(&data[i[8]]), *reinterpret_cast<Chunk>(&data[i[9]])));
        const __m256i rec23 = avx_cast<__m256i>(_mm256_setr_pd(
            *reinterpret_cast<Chunk>(&data[i[2]]), *reinterpret_cast<Chunk>(&data[i[3]]),
            *reinterpret_cast<Chunk>(&data[i[10]]), *reinterpret_cast<Chunk>(&data[i[11]])));
        const __m256i rec45 = avx_cast<__m256i>(_mm256_setr_pd(
            *reinterpret_cast<Chunk>(&data[i[4]]), *reinterpret_cast<Chunk>(&data[i[5]]),
            *reinterpret_cast<Chunk>(&data[i[12]]), *reinterpret_cast<Chunk>(&data[i[13]])));
        const __m256i rec67 = avx_cast<__m256i>(_mm256_setr_pd(
            *reinterpret_cast<Chunk>(&data[i[6]]), *reinterpret_cast<Chunk>(&data[i[7]]),
            *reinterpret_cast<Chunk>(&data[i[14]]), *reinterpret_cast<Chunk>(&data[i[15]])));

        // Stage 1: combine records that are four apart.
        const __m256i mix04 = AVX::unpacklo_epi16(rec01, rec45); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 a12 b8 ...
        const __m256i mix15 = AVX::unpackhi_epi16(rec01, rec45); // a1 a5 b1 b5 c1 c5 d1 d5 | a9 ...
        const __m256i mix26 = AVX::unpacklo_epi16(rec23, rec67); // a2 a6 b2 b6 c2 c6 d2 d6 | a10 ...
        const __m256i mix37 = AVX::unpackhi_epi16(rec23, rec67); // a3 a7 b3 b7 c3 c7 d3 d7 | a11 ...

        // Stage 2: separate the (a, b) half from the (c, d) half.
        const __m256i ab0246 = AVX::unpacklo_epi16(mix04, mix26); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
        const __m256i cd0246 = AVX::unpackhi_epi16(mix04, mix26); // c0 c2 c4 c6 d0 d2 d4 d6 | c8 ...
        const __m256i ab1357 = AVX::unpacklo_epi16(mix15, mix37); // a1 a3 a5 a7 b1 b3 b5 b7 | a9 ...
        const __m256i cd1357 = AVX::unpackhi_epi16(mix15, mix37); // c1 c3 c5 c7 d1 d3 d5 d7 | c9 ...

        // Stage 3: merge even and odd records into the final per-member vectors.
        v0.data() = AVX::unpacklo_epi16(ab0246, ab1357); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
        v1.data() = AVX::unpackhi_epi16(ab0246, ab1357); // b0 b1 ...
        v2.data() = AVX::unpacklo_epi16(cd0246, cd1357); // c0 c1 ...
        v3.data() = AVX::unpackhi_epi16(cd0246, cd1357); // d0 d1 ...
    }/*}}}*/
    // Deinterleave five members from 16 records. 128 bits (eight 16-bit entries, named a..h
    // below) are loaded unaligned at each of data[i[0]] ... data[i[15]]; v0..v4 receive the
    // entries a..e of every record, with record k in slot k.
    // NOTE(review): every load covers eight entries although only five are used - confirm that
    // the memory behind the last record is readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        using namespace AVX;
        // Record `lo` goes into the low 128-bit lane, record `hi` into the high lane.
        const auto loadRecords = [&](int lo, int hi) -> __m256i {
            return concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[lo]])),
                          _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[hi]])));
        };
        const __m256i row0 = loadRecords(0, 8);
        const __m256i row1 = loadRecords(1, 9);
        const __m256i row2 = loadRecords(2, 10);
        const __m256i row3 = loadRecords(3, 11);
        const __m256i row4 = loadRecords(4, 12);
        const __m256i row5 = loadRecords(5, 13);
        const __m256i row6 = loadRecords(6, 14);
        const __m256i row7 = loadRecords(7, 15);

        // Stage 1: combine records that are four apart (lo: members a-d, hi: members e-h).
        const __m256i lo04 = AVX::unpacklo_epi16(row0, row4); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
        const __m256i lo15 = AVX::unpacklo_epi16(row1, row5); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m256i lo26 = AVX::unpacklo_epi16(row2, row6); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m256i lo37 = AVX::unpacklo_epi16(row3, row7); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m256i hi04 = AVX::unpackhi_epi16(row0, row4); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m256i hi15 = AVX::unpackhi_epi16(row1, row5); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m256i hi26 = AVX::unpackhi_epi16(row2, row6); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m256i hi37 = AVX::unpackhi_epi16(row3, row7); // e3 e7 f3 f7 g3 g7 h3 h7

        // Stage 2: collect the even and the odd records of two members each.
        const __m256i ab0246 = AVX::unpacklo_epi16(lo04, lo26); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
        const __m256i ab1357 = AVX::unpacklo_epi16(lo15, lo37); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m256i cd0246 = AVX::unpackhi_epi16(lo04, lo26); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m256i cd1357 = AVX::unpackhi_epi16(lo15, lo37); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m256i ef0246 = AVX::unpacklo_epi16(hi04, hi26); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m256i ef1357 = AVX::unpacklo_epi16(hi15, hi37); // e1 e3 e5 e7 f1 f3 f5 f7

        // Stage 3: merge even and odd records into the final per-member vectors.
        v0.data() = AVX::unpacklo_epi16(ab0246, ab1357); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
        v1.data() = AVX::unpackhi_epi16(ab0246, ab1357); // b0 b1 ...
        v2.data() = AVX::unpacklo_epi16(cd0246, cd1357); // c0 c1 ...
        v3.data() = AVX::unpackhi_epi16(cd0246, cd1357); // d0 d1 ...
        v4.data() = AVX::unpacklo_epi16(ef0246, ef1357); // e0 e1 ...
    }/*}}}*/
    // Deinterleave six members from 16 records. 128 bits (eight 16-bit entries, named a..h
    // below) are loaded unaligned at each of data[i[0]] ... data[i[15]]; v0..v5 receive the
    // entries a..f of every record, with record k in slot k.
    // NOTE(review): every load covers eight entries although only six are used - confirm that
    // the memory behind the last record is readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        using namespace AVX;
        // Record `lo` goes into the low 128-bit lane, record `hi` into the high lane.
        const auto loadRecords = [&](int lo, int hi) -> __m256i {
            return concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[lo]])),
                          _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[hi]])));
        };
        const __m256i row0 = loadRecords(0, 8);
        const __m256i row1 = loadRecords(1, 9);
        const __m256i row2 = loadRecords(2, 10);
        const __m256i row3 = loadRecords(3, 11);
        const __m256i row4 = loadRecords(4, 12);
        const __m256i row5 = loadRecords(5, 13);
        const __m256i row6 = loadRecords(6, 14);
        const __m256i row7 = loadRecords(7, 15);

        // Stage 1: combine records that are four apart (lo: members a-d, hi: members e-h).
        const __m256i lo04 = AVX::unpacklo_epi16(row0, row4); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
        const __m256i lo15 = AVX::unpacklo_epi16(row1, row5); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m256i lo26 = AVX::unpacklo_epi16(row2, row6); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m256i lo37 = AVX::unpacklo_epi16(row3, row7); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m256i hi04 = AVX::unpackhi_epi16(row0, row4); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m256i hi15 = AVX::unpackhi_epi16(row1, row5); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m256i hi26 = AVX::unpackhi_epi16(row2, row6); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m256i hi37 = AVX::unpackhi_epi16(row3, row7); // e3 e7 f3 f7 g3 g7 h3 h7

        // Stage 2: collect the even and the odd records of two members each.
        const __m256i ab0246 = AVX::unpacklo_epi16(lo04, lo26); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
        const __m256i ab1357 = AVX::unpacklo_epi16(lo15, lo37); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m256i cd0246 = AVX::unpackhi_epi16(lo04, lo26); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m256i cd1357 = AVX::unpackhi_epi16(lo15, lo37); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m256i ef0246 = AVX::unpacklo_epi16(hi04, hi26); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m256i ef1357 = AVX::unpacklo_epi16(hi15, hi37); // e1 e3 e5 e7 f1 f3 f5 f7

        // Stage 3: merge even and odd records into the final per-member vectors.
        v0.data() = AVX::unpacklo_epi16(ab0246, ab1357); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
        v1.data() = AVX::unpackhi_epi16(ab0246, ab1357); // b0 b1 ...
        v2.data() = AVX::unpacklo_epi16(cd0246, cd1357); // c0 c1 ...
        v3.data() = AVX::unpackhi_epi16(cd0246, cd1357); // d0 d1 ...
        v4.data() = AVX::unpacklo_epi16(ef0246, ef1357); // e0 e1 ...
        v5.data() = AVX::unpackhi_epi16(ef0246, ef1357); // f0 f1 ...
    }/*}}}*/
    // Deinterleave seven members from 16 records. 128 bits (eight 16-bit entries, named a..h
    // below) are loaded unaligned at each of data[i[0]] ... data[i[15]]; v0..v6 receive the
    // entries a..g of every record, with record k in slot k.
    // NOTE(review): every load covers eight entries although only seven are used - confirm
    // that the memory behind the last record is readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        using namespace AVX;
        // Record `lo` goes into the low 128-bit lane, record `hi` into the high lane.
        const auto loadRecords = [&](int lo, int hi) -> __m256i {
            return concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[lo]])),
                          _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[hi]])));
        };
        const __m256i row0 = loadRecords(0, 8);
        const __m256i row1 = loadRecords(1, 9);
        const __m256i row2 = loadRecords(2, 10);
        const __m256i row3 = loadRecords(3, 11);
        const __m256i row4 = loadRecords(4, 12);
        const __m256i row5 = loadRecords(5, 13);
        const __m256i row6 = loadRecords(6, 14);
        const __m256i row7 = loadRecords(7, 15);

        // Stage 1: combine records that are four apart (lo: members a-d, hi: members e-h).
        const __m256i lo04 = AVX::unpacklo_epi16(row0, row4); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
        const __m256i lo15 = AVX::unpacklo_epi16(row1, row5); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m256i lo26 = AVX::unpacklo_epi16(row2, row6); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m256i lo37 = AVX::unpacklo_epi16(row3, row7); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m256i hi04 = AVX::unpackhi_epi16(row0, row4); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m256i hi15 = AVX::unpackhi_epi16(row1, row5); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m256i hi26 = AVX::unpackhi_epi16(row2, row6); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m256i hi37 = AVX::unpackhi_epi16(row3, row7); // e3 e7 f3 f7 g3 g7 h3 h7

        // Stage 2: collect the even and the odd records of two members each.
        const __m256i ab0246 = AVX::unpacklo_epi16(lo04, lo26); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
        const __m256i ab1357 = AVX::unpacklo_epi16(lo15, lo37); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m256i cd0246 = AVX::unpackhi_epi16(lo04, lo26); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m256i cd1357 = AVX::unpackhi_epi16(lo15, lo37); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m256i ef0246 = AVX::unpacklo_epi16(hi04, hi26); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m256i ef1357 = AVX::unpacklo_epi16(hi15, hi37); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m256i gh0246 = AVX::unpackhi_epi16(hi04, hi26); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m256i gh1357 = AVX::unpackhi_epi16(hi15, hi37); // g1 g3 g5 g7 h1 h3 h5 h7

        // Stage 3: merge even and odd records into the final per-member vectors.
        v0.data() = AVX::unpacklo_epi16(ab0246, ab1357); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
        v1.data() = AVX::unpackhi_epi16(ab0246, ab1357); // b0 b1 ...
        v2.data() = AVX::unpacklo_epi16(cd0246, cd1357); // c0 c1 ...
        v3.data() = AVX::unpackhi_epi16(cd0246, cd1357); // d0 d1 ...
        v4.data() = AVX::unpacklo_epi16(ef0246, ef1357); // e0 e1 ...
        v5.data() = AVX::unpackhi_epi16(ef0246, ef1357); // f0 f1 ...
        v6.data() = AVX::unpacklo_epi16(gh0246, gh1357); // g0 g1 ...
    }/*}}}*/
    // Deinterleave eight members from 16 records. 128 bits (eight 16-bit entries, named a..h
    // below) are loaded unaligned at each of data[i[0]] ... data[i[15]]; v0..v7 receive the
    // entries a..h of every record, with record k in slot k.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        using namespace AVX;
        // Record `lo` goes into the low 128-bit lane, record `hi` into the high lane.
        const auto loadRecords = [&](int lo, int hi) -> __m256i {
            return concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[lo]])),
                          _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[hi]])));
        };
        const __m256i row0 = loadRecords(0, 8);
        const __m256i row1 = loadRecords(1, 9);
        const __m256i row2 = loadRecords(2, 10);
        const __m256i row3 = loadRecords(3, 11);
        const __m256i row4 = loadRecords(4, 12);
        const __m256i row5 = loadRecords(5, 13);
        const __m256i row6 = loadRecords(6, 14);
        const __m256i row7 = loadRecords(7, 15);

        // Stage 1: combine records that are four apart (lo: members a-d, hi: members e-h).
        const __m256i lo04 = AVX::unpacklo_epi16(row0, row4); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
        const __m256i lo15 = AVX::unpacklo_epi16(row1, row5); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m256i lo26 = AVX::unpacklo_epi16(row2, row6); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m256i lo37 = AVX::unpacklo_epi16(row3, row7); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m256i hi04 = AVX::unpackhi_epi16(row0, row4); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m256i hi15 = AVX::unpackhi_epi16(row1, row5); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m256i hi26 = AVX::unpackhi_epi16(row2, row6); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m256i hi37 = AVX::unpackhi_epi16(row3, row7); // e3 e7 f3 f7 g3 g7 h3 h7

        // Stage 2: collect the even and the odd records of two members each.
        const __m256i ab0246 = AVX::unpacklo_epi16(lo04, lo26); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
        const __m256i ab1357 = AVX::unpacklo_epi16(lo15, lo37); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m256i cd0246 = AVX::unpackhi_epi16(lo04, lo26); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m256i cd1357 = AVX::unpackhi_epi16(lo15, lo37); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m256i ef0246 = AVX::unpacklo_epi16(hi04, hi26); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m256i ef1357 = AVX::unpacklo_epi16(hi15, hi37); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m256i gh0246 = AVX::unpackhi_epi16(hi04, hi26); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m256i gh1357 = AVX::unpackhi_epi16(hi15, hi37); // g1 g3 g5 g7 h1 h3 h5 h7

        // Stage 3: merge even and odd records into the final per-member vectors.
        v0.data() = AVX::unpacklo_epi16(ab0246, ab1357); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
        v1.data() = AVX::unpackhi_epi16(ab0246, ab1357); // b0 b1 ...
        v2.data() = AVX::unpacklo_epi16(cd0246, cd1357); // c0 c1 ...
        v3.data() = AVX::unpackhi_epi16(cd0246, cd1357); // d0 d1 ...
        v4.data() = AVX::unpacklo_epi16(ef0246, ef1357); // e0 e1 ...
        v5.data() = AVX::unpackhi_epi16(ef0246, ef1357); // f0 f1 ...
        v6.data() = AVX::unpacklo_epi16(gh0246, gh1357); // g0 g1 ...
        v7.data() = AVX::unpackhi_epi16(gh0246, gh1357); // h0 h1 ...
    }/*}}}*/
};
// Interleaved stores (interleave) and loads (deinterleave) for vectors whose data is shuffled
// as eight 32-bit entries of one 256-bit register (every shuffle below is a _ps operation on
// avx_cast<m256> data). Index i[k] selects the position in `data` where record k starts;
// vN supplies (interleave) or receives (deinterleave) member N of all eight records.
// Notation in the layout comments: interleave uses "<member><slot>" (e.g. 1a = v1, slot a,
// with slots a..h), deinterleave uses "<member><record>" (e.g. b3 = member b of record 3).
// A '|' separates the low from the high 128-bit lane.
template<typename V> struct InterleaveImpl<V, 8, 32> {
    // Store two members: one 64-bit store of (v0[k], v1[k]) at data[i[k]] for k = 0..7.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        using namespace AVX;
        // [0a 1a 0b 1b 0e 1e 0f 1f]:
        const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
        // [0c 1c 0d 1d 0g 1g 0h 1h]:
        const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
        // Slots a..d live in the low lanes, slots e..h in the high lanes.
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
    }/*}}}*/
    // Store two members for Common::SuccessiveEntries<2> indexes: two records are written per
    // unaligned 128-bit store, so only i[0], i[2], i[4] and i[6] are read.
    // NOTE(review): this relies on record k+1 directly following record k in memory, which is
    // presumably what SuccessiveEntries<2> guarantees - confirm against its definition.
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        using namespace AVX;
        // [0a 1a 0b 1b 0e 1e 0f 1f]:
        const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
        // [0c 1c 0d 1d 0g 1g 0h 1h]:
        const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[0]]), lo128(tmp0));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[2]]), lo128(tmp1));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[4]]), hi128(tmp0));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[6]]), hi128(tmp1));
    }/*}}}*/
    // Store three members. With Vc_USE_MASKMOV_SCATTER the records are transposed in registers
    // and written with masked 128-bit stores that leave the fourth element untouched;
    // otherwise v0 and v1 are interleaved and v2 is scattered to data + 2.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
    {
        using namespace AVX;
#ifdef Vc_USE_MASKMOV_SCATTER
        // [0a 2a 0b 2b 0e 2e 0f 2f]:
        const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
        // [0c 2c 0d 2d 0g 2g 0h 2h]:
        const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
        // [1a __ 1b __ 1e __ 1f __]:
        const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
        // [1c __ 1d __ 1g __ 1h __]:
        const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
        const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); // 0a 1a 2a __ | 0e 1e 2e __
        const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); // 0b 1b 2b __ | 0f 1f 2f __
        const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); // 0c 1c 2c __ | 0g 1g 2g __
        const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); // 0d 1d 2d __ | 0h 1h 2h __
        // Enable the three lowest 32-bit elements only (the first argument is the highest one).
        const m128i mask = _mm_set_epi32(0, -1, -1, -1);
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[0]]), mask, lo128(tmp4));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[1]]), mask, lo128(tmp5));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[2]]), mask, lo128(tmp6));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[3]]), mask, lo128(tmp7));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[4]]), mask, hi128(tmp4));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[5]]), mask, hi128(tmp5));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[6]]), mask, hi128(tmp6));
        _mm_maskstore_ps(reinterpret_cast<MayAlias<float> *>(&data[i[7]]), mask, hi128(tmp7));
#else
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
#endif
    }/*}}}*/
    // Store four members: an in-register transpose followed by one unaligned 128-bit store of
    // (v0[k], v1[k], v2[k], v3[k]) at data[i[k]] for k = 0..7.
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        using namespace AVX;
        const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data())); // 0a 2a 0b 2b | 0e 2e 0f 2f
        const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data())); // 0c 2c 0d 2d | 0g 2g 0h 2h
        const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data())); // 1a 3a 1b 3b | 1e 3e 1f 3f
        const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data())); // 1c 3c 1d 3d | 1g 3g 1h 3h
        const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); // 0a 1a 2a 3a | 0e 1e 2e 3e
        const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); // 0b 1b 2b 3b | 0f 1f 2f 3f
        const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); // 0c 1c 2c 3c | 0g 1g 2g 3g
        const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); // 0d 1d 2d 3d | 0h 1h 2h 3h
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[0]]), lo128(tmp4));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[1]]), lo128(tmp5));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[2]]), lo128(tmp6));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[3]]), lo128(tmp7));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[4]]), hi128(tmp4));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[5]]), hi128(tmp5));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[6]]), hi128(tmp6));
        _mm_storeu_ps(reinterpret_cast<MayAlias<float> *>(&data[i[7]]), hi128(tmp7));
    }/*}}}*/
    // Store five members: the four-member interleave for v0..v3, then v4 is scattered to
    // data + 4 with the same indexes.
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    // Store six members: four-member interleave at data, two-member interleave at data + 4.
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    // Store seven members: four-member interleave at data, three-member interleave at data + 4.
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    // Store eight members: two four-member interleaves, at data and at data + 4.
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    // Load two members: one 64-bit load per record (data[i[0]] ... data[i[7]]), then an
    // in-register transpose so that v0 holds all a entries and v1 all b entries.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        using namespace AVX;
        const m128  il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]])); // a0 b0
        const m128  il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]])); // a2 b2
        const m128  il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]])); // a4 b4
        const m128  il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]])); // a6 b6
        const m128 il01 = _mm_loadh_pi(             il0, reinterpret_cast<__m64 const *>(&data[i[1]])); // a0 b0 a1 b1
        const m128 il23 = _mm_loadh_pi(             il2, reinterpret_cast<__m64 const *>(&data[i[3]])); // a2 b2 a3 b3
        const m128 il45 = _mm_loadh_pi(             il4, reinterpret_cast<__m64 const *>(&data[i[5]])); // a4 b4 a5 b5
        const m128 il67 = _mm_loadh_pi(             il6, reinterpret_cast<__m64 const *>(&data[i[7]])); // a6 b6 a7 b7

        const m256 tmp2 = concat(il01, il45); // a0 b0 a1 b1 | a4 b4 a5 b5
        const m256 tmp3 = concat(il23, il67); // a2 b2 a3 b3 | a6 b6 a7 b7

        const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); // a0 a2 b0 b2 | a4 a6 b4 b6
        const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); // a1 a3 b1 b3 | a5 a7 b5 b7

        v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1)); // a0 a1 a2 a3 | a4 a5 a6 a7
        v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1)); // b0 b1 b2 b3 | b4 b5 b6 b7
    }/*}}}*/
    // Load two members for Common::SuccessiveEntries<2> indexes: two unaligned 256-bit loads
    // at data[i[0]] and data[i[4]] fetch four records each.
    // NOTE(review): this relies on the records lying back to back in memory, which is
    // presumably what SuccessiveEntries<2> guarantees - confirm against its definition.
    static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
    {
        using namespace AVX;
        const m256 il0123 = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0]])); // a0 b0 a1 b1 a2 b2 a3 b3
        const m256 il4567 = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[4]])); // a4 b4 a5 b5 a6 b6 a7 b7

        // Regroup the 128-bit lanes to the layout the generic two-member overload starts from
        // (assuming X0/X1 select a lane of the first and Y0/Y1 a lane of the second argument).
        const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567); // a0 b0 a1 b1 | a4 b4 a5 b5
        const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567); // a2 b2 a3 b3 | a6 b6 a7 b7

        const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); // a0 a2 b0 b2 | a4 a6 b4 b6
        const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); // a1 a3 b1 b3 | a5 a7 b5 b7

        v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
        v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
    }/*}}}*/
    // Load three members: one unaligned 128-bit load per record followed by a transpose; the
    // fourth loaded entry (d) of each record is discarded.
    // NOTE(review): each load reads four entries although a record has only three members
    // here - confirm that the entry behind the last record is always readable.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        using namespace AVX;
        const m128  il0 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0]])); // a0 b0 c0 d0
        const m128  il1 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[1]])); // a1 b1 c1 d1
        const m128  il2 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[2]])); // a2 b2 c2 d2
        const m128  il3 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[3]])); // a3 b3 c3 d3
        const m128  il4 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[4]])); // a4 b4 c4 d4
        const m128  il5 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[5]])); // a5 b5 c5 d5
        const m128  il6 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[6]])); // a6 b6 c6 d6
        const m128  il7 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[7]])); // a7 b7 c7 d7

        const m256 il04 = concat(il0, il4); // a0 b0 c0 d0 | a4 b4 c4 d4
        const m256 il15 = concat(il1, il5);
        const m256 il26 = concat(il2, il6);
        const m256 il37 = concat(il3, il7);
        const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); // a0 a2 b0 b2 | a4 a6 b4 b6
        const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); // a1 a3 b1 b3 | a5 a7 b5 b7
        const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); // c0 c2 d0 d2 | c4 c6 d4 d6
        const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); // c1 c3 d1 d3 | c5 c7 d5 d7
        v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
        v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
        v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
    }/*}}}*/
    // Load four members: one unaligned 128-bit load per record followed by a full transpose.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        using namespace AVX;
        const m128  il0 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0]])); // a0 b0 c0 d0
        const m128  il1 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[1]])); // a1 b1 c1 d1
        const m128  il2 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[2]])); // a2 b2 c2 d2
        const m128  il3 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[3]])); // a3 b3 c3 d3
        const m128  il4 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[4]])); // a4 b4 c4 d4
        const m128  il5 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[5]])); // a5 b5 c5 d5
        const m128  il6 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[6]])); // a6 b6 c6 d6
        const m128  il7 = _mm_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[7]])); // a7 b7 c7 d7

        const m256 il04 = concat(il0, il4); // a0 b0 c0 d0 | a4 b4 c4 d4
        const m256 il15 = concat(il1, il5);
        const m256 il26 = concat(il2, il6);
        const m256 il37 = concat(il3, il7);
        const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); // a0 a2 b0 b2 | a4 a6 b4 b6
        const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); // a1 a3 b1 b3 | a5 a7 b5 b7
        const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); // c0 c2 d0 d2 | c4 c6 d4 d6
        const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); // c1 c3 d1 d3 | c5 c7 d5 d7
        v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
        v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
        v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
        v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
    }/*}}}*/
    // Load five members: v4 is gathered from data + 4, v0..v3 use the four-member overload.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        v4.gather(data + 4, i);
        deinterleave(data, i, v0, v1, v2, v3);
    }/*}}}*/
    // Load six members: four-member deinterleave at data, two-member deinterleave at data + 4.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    // Load six members for Common::SuccessiveEntries<6> indexes: six consecutive unaligned
    // 256-bit loads starting at data[i[0]] (only i[0] is read), followed by a transpose.
    // NOTE(review): the layout comments assume V::Size == 8 and eight records of six entries
    // stored back to back, and that X0/X1 (Y0/Y1) select a 128-bit lane of the first (second)
    // shuffle128 argument - confirm against SuccessiveEntries and Mem::shuffle128.
    static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        using namespace AVX;
        const m256 a = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0]])); // a0 b0 c0 d0 | e0 f0 a1 b1
        const m256 b = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0] + 1 * V::Size])); // c1 d1 e1 f1 | a2 b2 c2 d2
        const m256 c = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0] + 2 * V::Size])); // e2 f2 a3 b3 | c3 d3 e3 f3
        const m256 d = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0] + 3 * V::Size])); // a4 b4 c4 d4 | e4 f4 a5 b5
        const m256 e = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0] + 4 * V::Size])); // c5 d5 e5 f5 | a6 b6 c6 d6
        const m256 f = _mm256_loadu_ps(reinterpret_cast<const MayAlias<float> *>(&data[i[0] + 5 * V::Size])); // e6 f6 a7 b7 | c7 d7 e7 f7
        const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d); // a0 b0 c0 d0 | a4 b4 c4 d4
        const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e); // a2 b2 c2 d2 | a6 b6 c6 d6
        const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d); // e0 f0 a1 b1 | e4 f4 a5 b5
        const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f); // e2 f2 a3 b3 | e6 f6 a7 b7
        const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e); // c1 d1 e1 f1 | c5 d5 e5 f5
        const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f); // c3 d3 e3 f3 | c7 d7 e7 f7
        const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); // a0 a2 b0 b2 | a4 a6 b4 b6
        const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5); // a1 a3 b1 b3 | a5 a7 b5 b7
        const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3); // c0 c2 d0 d2 | c4 c6 d4 d6
        const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9); // c1 c3 d1 d3 | c5 c7 d5 d7
        const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5); // e0 e2 f0 f2 | e4 e6 f4 f6
        const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9); // e1 e3 f1 f3 | e5 e7 f5 f7
        v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
        v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
        v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
        v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
        v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
        v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
    }/*}}}*/
    // Load seven members: four-member deinterleave at data, three-member deinterleave at
    // data + 4.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6);
    }/*}}}*/
    // Load eight members: two four-member deinterleaves, at data and at data + 4.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6, v7);
    }/*}}}*/
};
/**
 * Interleaving stores (interleave) and deinterleaving loads (deinterleave) implemented
 * with 256-bit double-precision intrinsics (_mm256_*_pd / _mm_*_pd).
 *
 * Every function uses exactly four indexes, i[0]..i[3]. For the k-th index, member n of
 * the interleaved record lives at data[i[k] + n] and corresponds to entry k of vector vn.
 *
 * NOTE(review): the specialization arguments <V, 4, 32> presumably stand for 4 entries
 * in a 32-byte vector - confirm against the primary template.
 */
template<typename V> struct InterleaveImpl<V, 4, 32> {
    // Stores the pairs (v0[k], v1[k]) to data[i[k]], data[i[k] + 1] for k = 0..3, using one
    // unaligned 128-bit store per index.
    template <typename I>  // interleave 2 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        using namespace AVX;
        // tmp0 = v0[0] v1[0] v0[2] v1[2]
        const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
        // tmp1 = v0[1] v1[1] v0[3] v1[3]
        const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
        _mm_storeu_pd(&data[i[0]], lo128(tmp0));
        _mm_storeu_pd(&data[i[1]], lo128(tmp1));
        _mm_storeu_pd(&data[i[2]], hi128(tmp0));
        _mm_storeu_pd(&data[i[3]], hi128(tmp1));
    }
    // Stores the triples (v0[k], v1[k], v2[k]) starting at data[i[k]] for k = 0..3.
    // With Vc_USE_MASKMOV_SCATTER a masked 256-bit store writes the three values at once;
    // otherwise the 2-argument overload is combined with a scatter of v2 at data + 2.
    template <typename I>  // interleave 3 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
        using namespace AVX;
#ifdef Vc_USE_MASKMOV_SCATTER
        const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
        const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
        // v2 is unpacked with itself; only one copy of each value is stored (see mask)
        const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
        const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());

        // mask enables the three lowest 64-bit elements and disables the highest one
#if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
        // MSVC needs to be at Version 2012 before _mm256_set_epi64x works
        const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
#else
        const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
#endif
        _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
        _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
        _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
        _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
#else
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
#endif
    }
    // Stores the quadruples (v0[k], v1[k], v2[k], v3[k]) starting at data[i[k]] for
    // k = 0..3, as two unaligned 128-bit stores per index.
    // In the lane comments below the digit names the vector (v0..v3) and the letter the
    // entry (a..d = entries 0..3).
    template <typename I>  // interleave 4 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        using namespace AVX;
        // 0a 1a 0c 1c:
        const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
        // 0b 1b 0d 1d:
        const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
        // 2a 3a 2c 3c:
        const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
        // 2b 3b 2d 3d:
        const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
        /* The following might be more efficient once 256-bit stores are not split internally into 2
         * 128-bit stores.
        _mm256_storeu_pd(&data[i[0]], Mem::shuffle128<X0, Y0>(tmp0, tmp2));
        _mm256_storeu_pd(&data[i[1]], Mem::shuffle128<X0, Y0>(tmp1, tmp3));
        _mm256_storeu_pd(&data[i[2]], Mem::shuffle128<X1, Y1>(tmp0, tmp2));
        _mm256_storeu_pd(&data[i[3]], Mem::shuffle128<X1, Y1>(tmp1, tmp3));
        */
        _mm_storeu_pd(&data[i[0]  ], lo128(tmp0));
        _mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
        _mm_storeu_pd(&data[i[1]  ], lo128(tmp1));
        _mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
        _mm_storeu_pd(&data[i[2]  ], hi128(tmp0));
        _mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
        _mm_storeu_pd(&data[i[3]  ], hi128(tmp1));
        _mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
    }
    // Five members: 4-argument interleave for v0..v3, then a scatter of v4 at data + 4.
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    // Six members: 4-argument interleave at data, 2-argument interleave at data + 4.
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    // Seven members: 4-argument interleave at data, 3-argument interleave at data + 4.
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    // Eight members: 4-argument interleave at data and again at data + 4.
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    // Loads the pairs at data[i[k]], data[i[k] + 1] (k = 0..3) and transposes them so that
    // v0 receives the first and v1 the second member of every pair.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        using namespace Vc::AVX;
        // each 128-bit load fetches one (member0, member1) pair; pairs 0 and 2 share one
        // 256-bit register, pairs 1 and 3 the other
        const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
        const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));

        v0.data() = _mm256_unpacklo_pd(ab02, ab13);
        v1.data() = _mm256_unpackhi_pd(ab02, ab13);
    }/*}}}*/
    // Three members: v2 is gathered from data + 2, v0 and v1 come from the pair overload.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        v2.gather(data + 2, i);
        deinterleave(data, i, v0, v1);
    }/*}}}*/
    // Four members: pair overload at data and at data + 2.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }/*}}}*/
    // Five members: v4 is gathered from data + 4, the rest uses the pair overload twice.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        v4.gather(data + 4, i);
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }/*}}}*/
    // Six members: pair overload at data, data + 2 and data + 4.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    // Seven members: v6 is gathered from data + 6, the rest uses the pair overload.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        v6.gather(data + 6, i);
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    // Eight members: pair overload at data, data + 2, data + 4 and data + 6.
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        deinterleave(data + 6, i, v6, v7);
    }/*}}}*/
};
//}}}1
}  // namespace Detail
}  // namespace Vc

#endif  // VC_AVX_DETAIL_H_

// vim: foldmethod=marker

namespace Vc_VERSIONED_NAMESPACE
{
/**
 * Mask type for the AVX ABI. It holds Size = sizeof(VectorTypeF) / sizeof(T) boolean
 * entries in a Common::Storage<T, Size> member.
 *
 * The stored bits can be viewed as a float, double or integer intrinsic vector through
 * data(), dataD() and dataI(). Several member functions are only declared here and are
 * defined out of class.
 */
template <typename T> class Mask<T, VectorAbi::Avx>
{
public:
    using abi = VectorAbi::Avx;

    /**
     * The \c EntryType of masks is always bool, independent of \c T.
     */
    typedef bool EntryType;
    using value_type = EntryType;

    using MaskBool = Common::MaskBool<sizeof(T)>;
    /**
     * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
     * implementation. This type is useful for the \c sizeof operator in generic functions.
     */
    using VectorEntryType = MaskBool;

    /**
     * The associated Vector<T> type.
     */
    using Vector = AVX2::Vector<T>;

    ///\internal
    using VectorTypeF = AVX::FloatVectorType<typename AVX::VectorTypeHelper<T>::Type>;
    ///\internal
    using VectorTypeD = AVX::DoubleVectorType<VectorTypeF>;
    ///\internal
    using VectorTypeI = AVX::IntegerVectorType<VectorTypeF>;

private:
    // argument types of the constructors that take a raw intrinsic vector
    typedef const VectorTypeF VArg;
    typedef const VectorTypeD VdArg;
    typedef const VectorTypeI ViArg;

public:
    /// Number of mask entries.
    static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T);
    /// Defined to be equal to Size.
    static constexpr size_t MemoryAlignment = Size;
    /// Returns Size.
    static constexpr std::size_t size() { return Size; }
    // NOTE(review): macro defined elsewhere; presumably provides operator new/delete
    // overloads that respect the alignment of VectorType - confirm.
    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));

private:
    typedef Common::Storage<T, Size> Storage;

public:
    /**
     * The \c VectorType reveals the implementation-specific internal type used for the
     * SIMD type.
     */
    using VectorType = typename Storage::VectorType;

    using EntryReference = Vc::Detail::ElementReference<Mask>;
    using reference = EntryReference;

        // abstracts the way Masks are passed to functions, it can easily be changed to const ref here
#if defined Vc_MSVC && defined _WIN32
        typedef const Mask &AsArg;
#else
        typedef const Mask AsArg;
#endif

        // The default constructor does not initialize d explicitly.
        Vc_INTRINSIC Mask() {}
        // Implicit construction from a raw float, double or integer intrinsic vector; the
        // argument is cast to VectorType with avx_cast.
        Vc_INTRINSIC Mask(VArg  x) : d(AVX::avx_cast<VectorType>(x)) {}
        Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast<VectorType>(x)) {}
        Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast<VectorType>(x)) {}
        // Tag constructors: Detail::zero / Detail::allone vector respectively.
        Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero<VectorType>()) {}
        Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone<VectorType>()) {}
        // Broadcast: allone vector if b is true, zero vector otherwise.
        Vc_INTRINSIC explicit Mask(bool b)
            : d(b ? Detail::allone<VectorType>() : Detail::zero<VectorType>())
        {
        }
        Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
        Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }

        // implicit cast
        // Converts rhs.dataI() with Detail::mask_cast from the source mask's Size to this
        // mask's Size; enabled through Common::enable_if_mask_converts_implicitly.
        template <typename U>
        Vc_INTRINSIC Mask(U &&rhs,
                          Common::enable_if_mask_converts_implicitly<T, U> = nullarg)
            : d(AVX::avx_cast<VectorType>(
                  Detail::mask_cast<Traits::decay<U>::Size, Size, VectorTypeF>(
                      rhs.dataI())))
        {
        }

#if Vc_IS_VERSION_1
        // explicit cast, implemented via simd_cast (in avx/simd_cast_caller.h)
        template <typename U>
        Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                      "mask types") Vc_INTRINSIC
            explicit Mask(U &&rhs,
                          Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif

        // Constructs the mask by calling load(mem, f).
        template<typename Flags = DefaultLoadTag> Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); }

        // load and store are defined out of class.
        template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void load(const bool *mem, Flags = Flags());

        template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const;

        Vc_INTRINSIC Mask &operator=(const Mask &) = default;
        Vc_INTRINSIC_L Mask &operator=(const std::array<bool, Size> &values) Vc_INTRINSIC_R;
        Vc_INTRINSIC_L operator std::array<bool, Size>() const Vc_INTRINSIC_R;

        // specializations in mask.tcc
        // Generic comparison: two masks are equal if movemask of their storage is equal.
        Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const
        { return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); }

        Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const
        { return !operator==(rhs); }

        // and-not of the mask with an all-ones vector (presumably ~data() & allone, i.e.
        // the complement - depends on the operand order of Detail::andnot_)
        Vc_INTRINSIC Mask operator!() const { return Detail::andnot_(data(), Detail::allone<VectorTypeF>()); }

        Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::and_(data(), rhs.data())); return *this; }
        Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::or_ (data(), rhs.data())); return *this; }
        Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::xor_(data(), rhs.data())); return *this; }

        Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
        Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
        Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); }

        // && and || use the same implementation as & and |; as overloaded operators they
        // always evaluate both operands.
        Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
        Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }

        // no need for expression template optimizations because cmp(n)eq for floats are not bitwise
        // compares
        Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R;

        // shiftMask() is the raw movemask of the integer view; toInt() goes through
        // Detail::mask_to_int<Size> (get() below relies on toInt() having one bit per entry).
        Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); }
        Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }

        // The stored mask, cast to the float / integer / double intrinsic vector type.
        Vc_INTRINSIC VectorTypeF data () const { return AVX::avx_cast<VectorTypeF>(d.v()); }
        Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast<VectorTypeI>(d.v()); }
        Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast<VectorTypeD>(d.v()); }

private:
    // get/set are the element accessors used by the reference type, hence the friend.
    friend reference;
    // Returns whether bit i of m.toInt() is set.
    static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
    {
        return m.toInt() & (1 << i);
    }
    // Writes MaskBool(v) into entry i of the storage.
    template <typename U>
    static Vc_INTRINSIC void set(Mask &m, int i,
                                 U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
    {
        m.d.set(i, MaskBool(std::forward<U>(v)));
    }

public:
    /**
     * \note the returned object models the concept of a reference and
     * as such it can exist longer than the data it is referencing.
     * \note to avoid lifetime issues, we strongly advise not to store
     * any reference objects.
     */
    Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
    {
        return {*this, int(index)};
    }
    // Read-only element access; returns the value by copy via get().
    Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
    {
        return get(*this, index);
    }

        // Number of set bits in toInt().
        Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); }
        // Index of the lowest set bit of toInt().
        // NOTE(review): the _bit_scan_forward intrinsic is documented as undefined for a
        // zero input, so callers presumably must not call this on an empty mask - confirm.
        Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); }

        // generate and shifted are defined out of class.
        template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;

    private:
        // the storage becomes public when Vc_COMPILE_BENCHMARKS is defined
#ifdef Vc_COMPILE_BENCHMARKS
    public:
#endif
        Storage d;
};
// Namespace-scope definitions of the static constexpr data members declared in the class
// (required before C++17 whenever the members are odr-used).
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::MemoryAlignment;

}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

namespace Vc_VERSIONED_NAMESPACE
{
// store {{{1
// Writes the mask to the bool array at mem by forwarding the integer view of the mask,
// the destination pointer and the flags object to Detail::mask_store<Size>.
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::store(bool *mem, Flags f) const
{
    Detail::mask_store<Size>(dataI(), mem, f);
}

// load {{{1
// Replaces the mask with the result of Detail::mask_load<VectorTypeF, Size>(mem, f),
// cast to the storage's vector type.
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::load(const bool *mem, Flags f)
{
    const auto loaded = Detail::mask_load<VectorTypeF, Size>(mem, f);
    d.v() = AVX::avx_cast<VectorType>(loaded);
}

// operator[] {{{1
#ifdef Vc_IMPL_AVX2
// Element read for 16-bit entries: entry `index` is tested at bit 2 * index of
// shiftMask() (presumably because the movemask yields one bit per byte).
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<int16_t>::get(const AVX2::Mask<int16_t> &m,
                                                   int index) noexcept
{
    const int selector = 1 << (2 * index);
    return (m.shiftMask() & selector) != 0;
}
// Element read for unsigned 16-bit entries: entry `index` is tested at bit 2 * index of
// shiftMask() (presumably because the movemask yields one bit per byte).
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<uint16_t>::get(const AVX2::Mask<uint16_t> &m,
                                                    int index) noexcept
{
    const int selector = 1 << (2 * index);
    return (m.shiftMask() & selector) != 0;
}
#endif
// operator== {{{1
// Equality for double masks: compares the movemask results of the double views.
template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const
{
    const auto lhsBits = Detail::movemask(dataD());
    const auto rhsBits = Detail::movemask(rhs.dataD());
    return lhsBits == rhsBits;
}
#ifdef Vc_IMPL_AVX2
// Equality for short masks: compares the movemask results of the integer views.
template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const
{
    const auto lhsBits = Detail::movemask(dataI());
    const auto rhsBits = Detail::movemask(rhs.dataI());
    return lhsBits == rhsBits;
}
// Equality for unsigned short masks: compares the movemask results of the integer views.
template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const
{
    const auto lhsBits = Detail::movemask(dataI());
    const auto rhsBits = Detail::movemask(rhs.dataI());
    return lhsBits == rhsBits;
}
#endif

// isFull, isNotEmpty, isEmpty, isMix specializations{{{1
// Returns true when Detail::testc of the mask against an all-ones vector is nonzero.
// The view of the mask (double, float or integer) is chosen by sizeof(T).
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isFull() const {
    switch (sizeof(T)) {
    case 8:
        return Detail::testc(dataD(), Detail::allone<VectorTypeD>()) != 0;
    case 4:
        return Detail::testc(data(), Detail::allone<VectorTypeF>()) != 0;
    default:
        return Detail::testc(dataI(), Detail::allone<VectorTypeI>()) != 0;
    }
}

// Returns true when Detail::testz of the mask with itself is zero.
// The view of the mask (double, float or integer) is chosen by sizeof(T).
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isNotEmpty() const {
    if (sizeof(T) == 8) {
        const VectorTypeD bits = dataD();
        return Detail::testz(bits, bits) == 0;
    }
    if (sizeof(T) == 4) {
        const VectorTypeF bits = data();
        return Detail::testz(bits, bits) == 0;
    }
    const VectorTypeI bits = dataI();
    return Detail::testz(bits, bits) == 0;
}

// Returns true when Detail::testz of the mask with itself is nonzero.
// The view of the mask (double, float or integer) is chosen by sizeof(T).
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isEmpty() const {
    if (sizeof(T) == 8) {
        const VectorTypeD bits = dataD();
        return Detail::testz(bits, bits) != 0;
    }
    if (sizeof(T) == 4) {
        const VectorTypeF bits = data();
        return Detail::testz(bits, bits) != 0;
    }
    const VectorTypeI bits = dataI();
    return Detail::testz(bits, bits) != 0;
}

// Returns true when Detail::testnzc of the mask against an all-ones vector is nonzero.
// The view of the mask (double, float or integer) is chosen by sizeof(T).
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isMix() const {
    switch (sizeof(T)) {
    case 8:
        return Detail::testnzc(dataD(), Detail::allone<VectorTypeD>()) != 0;
    case 4:
        return Detail::testnzc(data(), Detail::allone<VectorTypeF>()) != 0;
    default:
        return Detail::testnzc(dataI(), Detail::allone<VectorTypeI>()) != 0;
    }
}

// generate {{{1
// Overload selected by the tag value 4 + 32: builds a mask of four 64-bit elements.
// Element k is all ones if gen(k) converts to true and zero otherwise.
// NOTE: the gen(k) calls are function-call arguments, so the language does not fix the
// order in which they are evaluated; generators should not depend on the call order.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4 + 32>)
{
    return _mm256_setr_epi64x(
        gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0,
        gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0);
}
// Overload selected by the tag value 8 + 32: builds a mask of eight 32-bit elements.
// Element k is all ones if gen(k) converts to true and zero otherwise.
// NOTE: the gen(k) calls are function-call arguments, so the language does not fix the
// order in which they are evaluated; generators should not depend on the call order.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8 + 32>)
{
    return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
                             gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0,
                             gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0,
                             gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0);
}
// Overload selected by the tag value 16 + 32: builds a mask of sixteen 16-bit elements.
// Element k is all ones if gen(k) converts to true and zero otherwise.
// NOTE: the gen(k) calls are function-call arguments, so the language does not fix the
// order in which they are evaluated; generators should not depend on the call order.
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16 + 32>)
{
    return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0,
                             gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0,
                             gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0,
                             gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0,
                             gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0,
                             gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0,
                             gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0,
                             gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0);
}
// Builds a mask from a generator by dispatching to the generate_impl overload whose tag
// equals Size + sizeof(Storage); the generator is perfectly forwarded.
template <typename T>
template <typename G>
Vc_INTRINSIC AVX2::Mask<T> Mask<T, VectorAbi::Avx>::generate(G &&gen)
{
    using DispatchTag = std::integral_constant<int, Size + sizeof(Storage)>;
    return generate_impl<AVX2::Mask<T>>(std::forward<G>(gen), DispatchTag());
}
// shifted {{{1
// Returns a copy of the mask shifted by `amount` entries.
//
// The entry count is scaled by sizeof(VectorEntryType) and the product selects the
// matching Detail::shifted<N> instantiation, which needs N as a compile-time constant -
// hence the explicit switch over every value from -31 to 31. A product of 0 returns
// *this unchanged; any product outside [-31, 31] returns Zero().
// NOTE(review): the direction associated with positive/negative amounts is determined by
// Detail::shifted, which is not visible here.
template <typename T> Vc_INTRINSIC Vc_PURE AVX2::Mask<T> Mask<T, VectorAbi::Avx>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case   0: return *this;
    case   1: return Detail::shifted<  1>(dataI());
    case   2: return Detail::shifted<  2>(dataI());
    case   3: return Detail::shifted<  3>(dataI());
    case   4: return Detail::shifted<  4>(dataI());
    case   5: return Detail::shifted<  5>(dataI());
    case   6: return Detail::shifted<  6>(dataI());
    case   7: return Detail::shifted<  7>(dataI());
    case   8: return Detail::shifted<  8>(dataI());
    case   9: return Detail::shifted<  9>(dataI());
    case  10: return Detail::shifted< 10>(dataI());
    case  11: return Detail::shifted< 11>(dataI());
    case  12: return Detail::shifted< 12>(dataI());
    case  13: return Detail::shifted< 13>(dataI());
    case  14: return Detail::shifted< 14>(dataI());
    case  15: return Detail::shifted< 15>(dataI());
    case  16: return Detail::shifted< 16>(dataI());
    case  17: return Detail::shifted< 17>(dataI());
    case  18: return Detail::shifted< 18>(dataI());
    case  19: return Detail::shifted< 19>(dataI());
    case  20: return Detail::shifted< 20>(dataI());
    case  21: return Detail::shifted< 21>(dataI());
    case  22: return Detail::shifted< 22>(dataI());
    case  23: return Detail::shifted< 23>(dataI());
    case  24: return Detail::shifted< 24>(dataI());
    case  25: return Detail::shifted< 25>(dataI());
    case  26: return Detail::shifted< 26>(dataI());
    case  27: return Detail::shifted< 27>(dataI());
    case  28: return Detail::shifted< 28>(dataI());
    case  29: return Detail::shifted< 29>(dataI());
    case  30: return Detail::shifted< 30>(dataI());
    case  31: return Detail::shifted< 31>(dataI());
    case  -1: return Detail::shifted< -1>(dataI());
    case  -2: return Detail::shifted< -2>(dataI());
    case  -3: return Detail::shifted< -3>(dataI());
    case  -4: return Detail::shifted< -4>(dataI());
    case  -5: return Detail::shifted< -5>(dataI());
    case  -6: return Detail::shifted< -6>(dataI());
    case  -7: return Detail::shifted< -7>(dataI());
    case  -8: return Detail::shifted< -8>(dataI());
    case  -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    case -17: return Detail::shifted<-17>(dataI());
    case -18: return Detail::shifted<-18>(dataI());
    case -19: return Detail::shifted<-19>(dataI());
    case -20: return Detail::shifted<-20>(dataI());
    case -21: return Detail::shifted<-21>(dataI());
    case -22: return Detail::shifted<-22>(dataI());
    case -23: return Detail::shifted<-23>(dataI());
    case -24: return Detail::shifted<-24>(dataI());
    case -25: return Detail::shifted<-25>(dataI());
    case -26: return Detail::shifted<-26>(dataI());
    case -27: return Detail::shifted<-27>(dataI());
    case -28: return Detail::shifted<-28>(dataI());
    case -29: return Detail::shifted<-29>(dataI());
    case -30: return Detail::shifted<-30>(dataI());
    case -31: return Detail::shifted<-31>(dataI());
    }
    // scaled amount is outside the supported range
    return Zero();
}
// }}}1

/*
template<> Vc_INTRINSIC AVX2::Mask< 4, 32> &AVX2::Mask< 4, 32>::operator=(const std::array<bool, 4> &values) {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    unsigned int x = *reinterpret_cast<const unsigned int *>(values.data());
    x *= 0xffu;
    __m128i y = _mm_cvtsi32_si128(x); //  4 Bytes
    y = _mm_unpacklo_epi8(y, y);    //  8 Bytes
    y = _mm_unpacklo_epi16(y, y);   // 16 Bytes
    d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(y, y), _mm_unpackhi_epi32(y, y)));
    return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 32> &AVX2::Mask< 8, 32>::operator=(const std::array<bool, 8> &values) {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    unsigned long long x = *reinterpret_cast<const unsigned long long *>(values.data());
    x *= 0xffull;
    __m128i y = _mm_cvtsi64_si128(x); //  8 Bytes
    y = _mm_unpacklo_epi8(y, y);   // 16 Bytes
    d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(y, y), _mm_unpackhi_epi16(y, y)));
    return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 16> &AVX2::Mask< 8, 16>::operator=(const std::array<bool, 8> &values) {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    unsigned long long x = *reinterpret_cast<const unsigned long long *>(values.data());
    x *= 0xffull;
    __m128i y = _mm_cvtsi64_si128(x); //  8 Bytes
    d.v() = AVX::avx_cast<__m128>(_mm_unpacklo_epi8(y, y));
    return *this;
}
template<> Vc_INTRINSIC AVX2::Mask<16, 16> &AVX2::Mask<16, 16>::operator=(const std::array<bool, 16> &values) {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    __m128i x = _mm_loadu_si128(reinterpret_cast<const __m128i *>(values.data()));
    d.v() = _mm_andnot_ps(AVX::_mm_setallone_ps(), AVX::avx_cast<__m128>(_mm_sub_epi8(x, _mm_set1_epi8(1))));
    return *this;
}

template<> Vc_INTRINSIC AVX2::Mask< 4, 32>::operator std::array<bool, 4>() const {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    __m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 64bit -> 32bit
    x = _mm_packs_epi32(x, x); // 32bit -> 16bit
    x = _mm_srli_epi16(x, 15);
    x = _mm_packs_epi16(x, x); // 16bit ->  8bit
    std::array<bool, 4> r;
    asm volatile("vmovd %1,%0" : "=m"(*r.data()) : "x"(x));
    return r;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 32>::operator std::array<bool, 8>() const {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    __m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 32bit -> 16bit
    x = _mm_srli_epi16(x, 15);
    x = _mm_packs_epi16(x, x); // 16bit ->  8bit
    std::array<bool, 8> r;
    asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x));
    return r;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 16>::operator std::array<bool, 8>() const {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    __m128i x = _mm_srli_epi16(dataI(), 15);
    x = _mm_packs_epi16(x, x); // 16bit ->  8bit
    std::array<bool, 8> r;
    asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x));
    return r;
}
template<> Vc_INTRINSIC AVX2::Mask<16, 16>::operator std::array<bool, 16>() const {
    static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
    __m128 x = _mm_and_ps(d.v(), AVX::avx_cast<__m128>(_mm_set1_epi32(0x01010101)));
    std::array<bool, 16> r;
    asm volatile("vmovups %1,%0" : "=m"(*r.data()) : "x"(x));
    return r;
}
*/

}

// vim: foldmethod=marker

#endif // VC_AVX_MASK_H_
#include <algorithm>
#include <cmath>

// Remove any isfinite / isnan macros that are defined at this point (presumably so that
// later uses of these names in this file are not macro-expanded - confirm).
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/**
 * Bundles the types that belong to a Vector<T, Abi>: the raw intrinsic type, the vector
 * and mask classes, and the write-masked vector proxy built from the latter two.
 */
template <typename T, typename Abi> struct VectorTraits
{
    // raw intrinsic type chosen by AVX::VectorTypeHelper for T
    using intrinsic_type = typename AVX::VectorTypeHelper<T>::Type;

    using vector_type = Vc::Vector<T, Abi>;
    using mask_type = Vc::Mask<T, Abi>;

    // proxy type combining a vector with a mask
    using writemasked_vector_type = Common::WriteMaskedVector<vector_type, mask_type>;
};
}  // namespace Detail

#define Vc_CURRENT_CLASS_NAME Vector
template <typename T> class Vector<T, VectorAbi::Avx>
{
public:
    using abi = VectorAbi::Avx;

private:
    using traits_type = Detail::VectorTraits<T, abi>;
    static_assert(
        std::is_arithmetic<T>::value,
        "Vector<T> only accepts arithmetic builtin types as template parameter T.");

    using WriteMaskedVector = typename traits_type::writemasked_vector_type;

public:
    using VectorType = typename traits_type::intrinsic_type;
    using vector_type = VectorType;

    using mask_type = typename traits_type::mask_type;
    using Mask = mask_type;
    using MaskType = mask_type;
    using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg;
    using MaskArgument = typename Mask::AsArg;
    using reference = Detail::ElementReference<Vector>;

    Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));

    // Element type of the vector: T passed through Common::ensure_alignment_equals_sizeof.
    using EntryType = typename Common::ensure_alignment_equals_sizeof<T>::type;
        using value_type = EntryType;
        typedef EntryType VectorEntryType;
        // Number of EntryType entries that fit into one VectorType.
        static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
        // Alignment of the underlying intrinsic type.
        static constexpr size_t MemoryAlignment = alignof(VectorType);
        enum Constants {
            HasVectorDivision = AVX::HasVectorDivisionHelper<T>::Value
        };
        // IndexType is a SimdArray<int, Size> whose storage vector depends on Size:
        // AVX2::int_v (width 8) when Size >= 8 and AVX2 is available, SSE::int_v (width 4)
        // when Size >= 4, and Scalar::int_v (width 1) otherwise.
#ifdef Vc_IMPL_AVX2
        typedef typename std::conditional<
            (Size >= 8), SimdArray<int, Size, AVX2::int_v, 8>,
            typename std::conditional<(Size >= 4), SimdArray<int, Size, SSE::int_v, 4>,
                                      SimdArray<int, Size, Scalar::int_v, 1>>::type>::type
            IndexType;
#else
        typedef typename std::conditional<(Size >= 4),
                                          SimdArray<int, Size, SSE::int_v, 4>,
                                          SimdArray<int, Size, Scalar::int_v, 1>>::type IndexType;
#endif
        // Parameter-passing aliases: vectors and intrinsic values are passed by value.
        typedef Vector<T, abi> AsArg;
        typedef VectorType VectorTypeArg;

    protected:
        // shorthand for a Vector with the same ABI but a different entry type
        template <typename U> using V = Vector<U, abi>;

        // helper that specializes on VectorType
        typedef AVX::VectorHelper<VectorType> HV;

        // helper that specializes on T
        typedef AVX::VectorHelper<T> HT;

        // cast any m256/m128 to VectorType (forwards to AVX::avx_cast<VectorType>)
        template <typename V> static Vc_INTRINSIC VectorType _cast(V v)
        {
            return AVX::avx_cast<VectorType>(v);
        }

        // The only data member: a union-like wrapper giving access to the data both as a
        // whole VectorType (d.v()) and per entry (d.m(i) / d.set(i, x)), as used below.
        typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
        StorageType d;

        using WidthT = Common::WidthT<VectorType>;
        // ICC can't compile this:
        // static constexpr WidthT Width = WidthT();

    public:

        static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R;

        ///////////////////////////////////////////////////////////////////////////////////////////
        // internal: required to enable returning objects of VectorType
        // (non-explicit, so a raw intrinsic value converts implicitly to Vector)
        Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {}

        // implicit conversion from compatible Vector<U, abi>
        // Only participates in overload resolution when
        // Traits::is_implicit_cast_allowed<U, T> is true; the data is converted with
        // AVX::convert<U, T>.
        template <typename U>
        Vc_INTRINSIC Vector(
            V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
                                            void *>::type = nullptr)
            : d(AVX::convert<U, T>(x.data()))
        {
        }

#if Vc_IS_VERSION_1
        // static_cast from the remaining Vector<U, abi>
        // (those for which Traits::is_implicit_cast_allowed<U, T> is false). The converted
        // data is additionally passed through Detail::zeroExtendIfNeeded. Deprecated in
        // favour of simd_cast.
        template <typename U>
        Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                      "vector types") Vc_INTRINSIC explicit Vector(
            V<U> x,
            typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
                                    void *>::type = nullptr)
            : d(Detail::zeroExtendIfNeeded(AVX::convert<U, T>(x.data())))
        {
        }

        // static_cast from other types, implemented via the non-member simd_cast function in
        // simd_cast_caller.tcc
        // Enabled for any SIMD vector type U other than Vector itself; declaration only.
        template <typename U,
                  typename = enable_if<Traits::is_simd_vector<U>::value &&
                                       !std::is_same<Vector, Traits::decay<U>>::value>>
        Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
                      "vector types") Vc_INTRINSIC_L
            explicit Vector(U &&x) Vc_INTRINSIC_R;
#endif

        ///////////////////////////////////////////////////////////////////////////////////////////
        // broadcast
        // Initializes the vector from a single scalar via Detail::avx_broadcast.
        Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {}
        // Additional broadcast overload that accepts exactly `int` when EntryType is not
        // int: the value is static_cast to EntryType and forwarded to the constructor
        // above. Other argument types do not match this template.
        template <typename U>
        Vc_INTRINSIC Vector(U a,
                            typename std::enable_if<std::is_same<U, int>::value &&
                                                        !std::is_same<U, EntryType>::value,
                                                    void *>::type = nullptr)
            : Vector(static_cast<EntryType>(a))
        {
        }

        // Intentionally unusable constructor: the static_assert condition
        // (EntryType being void) cannot hold for an arithmetic T, so using this
        // constructor produces the diagnostic below instead of compiling.
        //template<typename U>
        explicit Vector(std::initializer_list<EntryType>)
        {
            static_assert(std::is_same<EntryType, void>::value,
                          "A SIMD vector object cannot be initialized from an initializer list "
                          "because the number of entries in the vector is target-dependent.");
        }


        ///////////////////////////////////////////////////////////////////////////////////////////
        // zeroing
        Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;

        Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
        Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R;

#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
        // skip this code for MSVC because it fails to do overload resolution correctly
        // Gather taking a native index vector. The index parameter type depends on Size:
        // AVX2::int_v for Size == 8, SSE::int_v for Size == 4, and void * otherwise.
        // Declaration only; the definition is not part of this class body.
        Vc_INTRINSIC_L void gatherImplementation(
            const EntryType *mem,
            typename std::conditional<
                Size == 8, AVX2::int_v,
                typename std::conditional<Size == 4, SSE::int_v, void *>::type>::type
                indexes) Vc_INTRINSIC_R;

        // Generic gather for arithmetic MT of at least sizeof(short) and integral index
        // type U: gathers into a SimdArray<MT, Size> and converts the result to Vector
        // with simd_cast.
        template <class MT, class U>
        Vc_INTRINSIC
            enable_if<std::is_arithmetic<MT>::value && std::is_integral<U>::value &&
                          (sizeof(MT) >= sizeof(short)),
                      void>
            gatherImplementation(const MT *mem, const SimdArray<U, Size> &indexes)
        {
            *this = simd_cast<Vector>(SimdArray<MT, Size>(mem, indexes));
        }

        // Gather of 16 two-byte entries: the indexes are split into two AVX2::int_v
        // halves and each half is fetched with _mm256_i32gather_epi32 using a scale of 2
        // (byte offset = index * 2). The two int_v results are then combined into a
        // Vector by simd_cast.
        // NOTE(review): each 32-bit gather load covers the addressed 2-byte entry plus
        // the 2 bytes after it; presumably simd_cast keeps only the low 16 bits and the
        // over-read past the last entry is tolerated by callers - confirm.
        template <class U>
        Vc_INTRINSIC enable_if<std::is_integral<U>::value && sizeof(EntryType) == 2, void>
        gatherImplementation(const EntryType *mem, const SimdArray<U, 16> &indexes)
        {
            const auto lo = simd_cast<AVX2::int_v, 0>(indexes);
            const auto hi = simd_cast<AVX2::int_v, 1>(indexes);
            *this = simd_cast<Vector>(
                AVX2::int_v(_mm256_i32gather_epi32(
                    reinterpret_cast<const MayAlias<int> *>(mem), lo.data(), 2)),
                AVX2::int_v(_mm256_i32gather_epi32(
                    reinterpret_cast<const MayAlias<int> *>(mem), hi.data(), 2)));
        }

        // Size == 8: convert any 8-entry integral SimdArray to AVX2::int_v and forward to
        // the native-index overload declared above.
        template <class U, class V, std::size_t Wt>
        Vc_INTRINSIC enable_if<std::is_integral<U>::value && Size == 8, void>
        gatherImplementation(const EntryType *mem, const SimdArray<U, 8, V, Wt> &indexes)
        {
            gatherImplementation(mem, simd_cast<AVX2::int_v>(indexes));
        }

        // Size == 4: convert any 4-entry integral SimdArray to SSE::int_v and forward to
        // the native-index overload declared above.
        template <class U, class V, std::size_t Wt>
        Vc_INTRINSIC enable_if<std::is_integral<U>::value && Size == 4, void>
        gatherImplementation(const EntryType *mem, const SimdArray<U, 4, V, Wt> &indexes)
        {
            gatherImplementation(mem, simd_cast<SSE::int_v>(indexes));
        }
#endif  // Vc_IMPL_AVX2 && !MSVC

        ///////////////////////////////////////////////////////////////////////////////////////////
        // prefix: add/subtract Detail::one(T()) to the data and return *this
        Vc_ALWAYS_INLINE Vector &operator++()
        {
            data() = Detail::add(data(), Detail::one(T()), T());
            return *this;
        }
        Vc_ALWAYS_INLINE Vector &operator--()
        {
            data() = Detail::sub(data(), Detail::one(T()), T());
            return *this;
        }
        // postfix: same update as the prefix forms, but a copy of the previous value is
        // returned
        Vc_ALWAYS_INLINE Vector operator++(int)
        {
            const Vector previous = *this;
            data() = Detail::add(data(), Detail::one(T()), T());
            return previous;
        }
        Vc_ALWAYS_INLINE Vector operator--(int)
        {
            const Vector previous = *this;
            data() = Detail::sub(data(), Detail::one(T()), T());
            return previous;
        }

    private:
        friend reference;
        // Element accessors for the befriended `reference` proxy type.
        // get: returns entry i of o by value.
        Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
        {
            return o.d.m(i);
        }
        // set: stores v into entry i of o; noexcept exactly when assigning v to a
        // value_type lvalue is noexcept.
        template <typename U>
        Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
            noexcept(std::declval<value_type &>() = v))
        {
            o.d.set(i, v);
        }

    public:
        /**
         * Non-const subscript: returns a proxy (`reference`) bound to this vector and the
         * entry number \p index instead of a real C++ reference.
         *
         * \note the returned object models the concept of a reference and
         * as such it can exist longer than the data it is referencing.
         * \note to avoid lifetime issues, we strongly advise not to store
         * any reference objects.
         */
        Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
        {
            // guards the noexcept specification above: constructing the proxy must not throw
            static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
            return {*this, int(index)};
        }
        // Const subscript: returns entry \p index by value.
        Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
        {
            return d.m(index);
        }

        Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R;
        Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R;

        // Logical not: yields the mask of the comparison *this == Zero().
        Vc_INTRINSIC Vc_PURE Mask operator!() const
        {
            return *this == Zero();
        }
        // Bit complement, computed as Detail::andnot_ of the data with an all-ones value.
        // Rejected at compile time for non-integral T unless
        // Vc_ENABLE_FLOAT_BIT_OPERATORS is defined.
        Vc_ALWAYS_INLINE Vector operator~() const
        {
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
            static_assert(std::is_integral<T>::value,
                          "bit-complement can only be used with Vectors of integral type");
#endif
            return Detail::andnot_(data(), Detail::allone<VectorType>());
        }
        // Unary minus: declaration only, defined outside this class body.
        Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
        // Unary plus: returns a copy of *this.
        Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }

        // shifts
        // Vc_OP_VEC(op) is expanded once per operator listed by Vc_ALL_SHIFTS. For each
        // `op` it declares the compound assignment `operator op=(AsArg)` (no definition
        // here) and defines a const `operator op(AsArg)` whose body consists solely of a
        // static_assert restricting the operator to integral T.
        // NOTE(review): the generated `operator op(AsArg) const` has return type Vector
        // but no return statement, so calling it would flow off the end of a non-void
        // function (undefined behavior). Presumably the vector-by-vector shifts actually
        // used are provided elsewhere and this member is never selected - confirm, and
        // either give it a proper body or remove it.
#define Vc_OP_VEC(op)                                                                    \
    Vc_INTRINSIC Vector &operator op##=(AsArg x);                                        \
    Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const                               \
    {                                                                                    \
        static_assert(                                                                   \
            std::is_integral<T>::value,                                                  \
            "bitwise-operators can only be used with Vectors of integral type");         \
    }
    Vc_ALL_SHIFTS(Vc_OP_VEC);
#undef Vc_OP_VEC

        Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R;

        // Deprecated member spelling of the free function Vc::isnegative; forwards
        // *this and returns the resulting mask.
        Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
            isNegative() const
        {
            return Vc::isnegative(*this);
        }

        // Masked assignment: replaces the data with the result of
        // Detail::blend(current data, v's data, mask), where the mask is first cast to
        // VectorType.
        Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask)
        {
            const VectorType selector = _cast(mask.data());
            VectorType &self = data();
            self = Detail::blend(self, v.data(), selector);
        }

        // Deprecated: converts by constructing a V2 from *this.
        template <typename V2>
        Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
            staticCast() const
        {
            return V2(*this);
        }
        // Deprecated: reinterprets the underlying data as V2's intrinsic type via
        // AVX::avx_cast (a cast of the register type, not a per-entry value conversion).
        template <typename V2>
        Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
            reinterpretCast() const
        {
            return AVX::avx_cast<typename V2::VectorType>(data());
        }

        // Write-masking: returns a WriteMaskedVector built from this vector and mask k.
        Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k)
        {
            return {*this, k};
        }

        // Direct access to the underlying intrinsic value (mutable and const overloads).
        Vc_ALWAYS_INLINE VectorType &data() { return d.v(); }
        Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); }

        template<int Index>
        Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;

        Vc_INTRINSIC_L std::pair<Vector, int> minIndex() const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L std::pair<Vector, int> maxIndex() const Vc_INTRINSIC_R;

        // Reductions of the whole vector to a single EntryType, delegated to the
        // Detail::min/max/mul/add overloads that take the data and a T() tag.
        Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); }
        Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); }
        Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); }
        Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); }
        Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R;
        //template<typename BinaryOperation> Vc_ALWAYS_INLINE_L Vector partialSum(BinaryOperation op) const Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R;

        Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
        Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        template <typename F> void callWithValuesSorted(F &&f)
        {
            EntryType value = d.m(0);
            f(value);
            for (size_t i = 1; i < Size; ++i) {
                if (d.m(i) != value) {
                    value = d.m(i);
                    f(value);
                }
            }
        }

        // Invokes f with a copy of every entry of the vector.
        template <typename F> Vc_INTRINSIC void call(F &&f) const
        {
            Common::for_all_vector_entries<Size>([&](size_t entry) {
                f(EntryType(d.m(entry)));
            });
        }

        // Invokes f with a copy of every entry whose index is produced by iterating
        // where(mask).
        template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
        {
            for (size_t entry : where(mask)) {
                f(EntryType(d.m(entry)));
            }
        }

        // Returns a new vector whose entry i is f(entry i of *this).
        template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
        {
            Vector result;
            Common::for_all_vector_entries<Size>([&](size_t entry) {
                result.d.set(entry, f(EntryType(d.m(entry))));
            });
            return result;
        }

        // Masked variant: starts from a copy of *this and replaces only the entries
        // whose index is produced by iterating where(mask) with f(entry).
        template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
        {
            Vector result(*this);
            for (size_t entry : where(mask)) {
                result.d.set(entry, f(EntryType(result.d.m(entry))));
            }
            return result;
        }

        // Sets every entry to the result of f; this overload passes the entry index to f.
        template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT))
        {
            Common::for_all_vector_entries<Size>([&](size_t entry) {
                d.set(entry, f(entry));
            });
        }
        // Same, for a generator function without parameters (called once per entry).
        Vc_INTRINSIC void fill(EntryType (&f)())
        {
            Common::for_all_vector_entries<Size>([&](size_t entry) {
                d.set(entry, f());
            });
        }

        template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;

        // Deprecated member spelling of the free function Vc::copysign; returns
        // Vc::copysign(*this, reference).
        Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
            copySign(AsArg reference) const
        {
            return Vc::copysign(*this, reference);
        }

        // Deprecated member spelling of the free function Vc::exponent; returns
        // Vc::exponent(*this).
        Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
        {
            // The result has to be returned explicitly: flowing off the end of a
            // function with a non-void return type is undefined behavior.
            return Vc::exponent(*this);
        }

        Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
        Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
#undef Vc_CURRENT_CLASS_NAME
// Namespace-scope definitions of the static constexpr data members declared in the
// class; before C++17 these are required whenever the members are odr-used.
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::MemoryAlignment;

// Compile-time sanity checks: every AVX2 vector alias must be recognised by
// Traits::is_simd_vector and every AVX2 mask alias by Traits::is_simd_mask.
static_assert(Traits::is_simd_vector<AVX2::double_v>::value, "is_simd_vector<double_v>::value");
static_assert(Traits::is_simd_vector<AVX2:: float_v>::value, "is_simd_vector< float_v>::value");
static_assert(Traits::is_simd_vector<AVX2::   int_v>::value, "is_simd_vector<   int_v>::value");
static_assert(Traits::is_simd_vector<AVX2::  uint_v>::value, "is_simd_vector<  uint_v>::value");
static_assert(Traits::is_simd_vector<AVX2:: short_v>::value, "is_simd_vector< short_v>::value");
static_assert(Traits::is_simd_vector<AVX2::ushort_v>::value, "is_simd_vector<ushort_v>::value");
static_assert(Traits::is_simd_mask  <AVX2::double_m>::value, "is_simd_mask  <double_m>::value");
static_assert(Traits::is_simd_mask  <AVX2:: float_m>::value, "is_simd_mask  < float_m>::value");
static_assert(Traits::is_simd_mask  <AVX2::   int_m>::value, "is_simd_mask  <   int_m>::value");
static_assert(Traits::is_simd_mask  <AVX2::  uint_m>::value, "is_simd_mask  <  uint_m>::value");
static_assert(Traits::is_simd_mask  <AVX2:: short_m>::value, "is_simd_mask  < short_m>::value");
static_assert(Traits::is_simd_mask  <AVX2::ushort_m>::value, "is_simd_mask  <ushort_m>::value");

// With AVX2 enabled, additionally verify that no pointer type converts implicitly to
// short_v (guards against an overly permissive constructor).
#ifdef Vc_IMPL_AVX2
static_assert(!std::is_convertible<float *, AVX2::short_v>::value, "A float* should never implicitly convert to short_v. Something is broken.");
static_assert(!std::is_convertible<int *  , AVX2::short_v>::value, "An int* should never implicitly convert to short_v. Something is broken.");
static_assert(!std::is_convertible<short *, AVX2::short_v>::value, "A short* should never implicitly convert to short_v. Something is broken.");
#endif

// Generates one conditional_assign overload per compound-assignment operator.
// The overload is selected through the explicit template argument O (it is enabled only
// for O == Operator::name_) and applies `op_` to the write-masked view lhs(mask) with
// rhs as the right-hand operand.
#define Vc_CONDITIONAL_ASSIGN(name_, op_)                                                \
    template <Operator O, typename T, typename M, typename U>                            \
    Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign(               \
        AVX2::Vector<T> &lhs, M &&mask, U &&rhs)                                         \
    {                                                                                    \
        lhs(mask) op_ rhs;                                                               \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(          Assign,  =);
Vc_CONDITIONAL_ASSIGN(      PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN(     MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN(  MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN(    DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN(       XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN(       AndAssign, &=);
Vc_CONDITIONAL_ASSIGN(        OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN

// Second use of the macro name: generates the conditional_assign overloads for the
// increment/decrement operators. Each overload is enabled only for O == Operator::name_,
// evaluates expr_ on the write-masked view lhs(mask) and returns the result as an
// AVX2::Vector<T>.
#define Vc_CONDITIONAL_ASSIGN(name_, expr_)                                              \
    template <Operator O, typename T, typename M>                                        \
    Vc_INTRINSIC enable_if<O == Operator::name_, AVX2::Vector<T>> conditional_assign(    \
        AVX2::Vector<T> &lhs, M &&mask)                                                  \
    {                                                                                    \
        return expr_;                                                                    \
    }                                                                                    \
    Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN

}  // namespace Vc

/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_LIMITS_H_
#define VC_AVX_LIMITS_H_


namespace std
{
// Vc_NUM_LIM(T, _max, _min) specializes std::numeric_limits for Vc::AVX2::Vector<T>.
// The specialization inherits all scalar properties from numeric_limits<T> and replaces
// the value-returning functions with vector-returning ones: max() and min() return the
// given expressions, lowest() forwards to min(), and epsilon(), round_error(),
// infinity(), quiet_NaN(), signaling_NaN() and denorm_min() all return Zero().
// It is only instantiated for integer element types below.
#define Vc_NUM_LIM(T, _max, _min)                                                        \
    template <> struct numeric_limits<Vc::AVX2::Vector<T>> : public numeric_limits<T> {  \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> max() Vc_NOEXCEPT               \
        {                                                                                \
            return _max;                                                                 \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> min() Vc_NOEXCEPT               \
        {                                                                                \
            return _min;                                                                 \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> lowest() Vc_NOEXCEPT            \
        {                                                                                \
            return min();                                                                \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> epsilon() Vc_NOEXCEPT           \
        {                                                                                \
            return Vc::AVX2::Vector<T>::Zero();                                          \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> round_error() Vc_NOEXCEPT       \
        {                                                                                \
            return Vc::AVX2::Vector<T>::Zero();                                          \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> infinity() Vc_NOEXCEPT          \
        {                                                                                \
            return Vc::AVX2::Vector<T>::Zero();                                          \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> quiet_NaN() Vc_NOEXCEPT         \
        {                                                                                \
            return Vc::AVX2::Vector<T>::Zero();                                          \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> signaling_NaN() Vc_NOEXCEPT     \
        {                                                                                \
            return Vc::AVX2::Vector<T>::Zero();                                          \
        }                                                                                \
        static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> denorm_min() Vc_NOEXCEPT        \
        {                                                                                \
            return Vc::AVX2::Vector<T>::Zero();                                          \
        }                                                                                \
    }

// Integer vectors exist only with AVX2. Unsigned: max is all bits set, min is zero.
// Signed: max is all bits set shifted right logically by one (sign bit cleared), min
// comes from AVX::setmin_epi16 / setmin_epi32.
#ifdef Vc_IMPL_AVX2
Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM(         short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16());
Vc_NUM_LIM(  unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM(           int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32());
#endif
#undef Vc_NUM_LIM

} // namespace std

#endif // VC_AVX_LIMITS_H_
/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_CONST_H_
#define VC_AVX_CONST_H_

#include <cstddef>

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
    // IndexesFromZeroData<T>::address() returns a pointer to the first entry of the
    // _IndexesFromZero32 / _IndexesFromZero16 / _IndexesFromZero8 table matching
    // sizeof(T). The unsigned specializations return the table address directly; the
    // signed and plain-char ones reinterpret_cast it to their own element type.
    // NOTE(review): the tables presumably hold 0, 1, 2, ... - confirm at their definition.
    // The primary template is only declared, so other T are rejected at compile time.
    template<typename T> struct IndexesFromZeroData;
    template<> struct IndexesFromZeroData<int> {
        static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast<const int *>(&_IndexesFromZero32[0]); }
    };
    template<> struct IndexesFromZeroData<unsigned int> {
        static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; }
    };
    template<> struct IndexesFromZeroData<short> {
        static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast<const short *>(&_IndexesFromZero16[0]); }
    };
    template<> struct IndexesFromZeroData<unsigned short> {
        static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; }
    };
    template<> struct IndexesFromZeroData<signed char> {
        static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast<const signed char *>(&_IndexesFromZero8[0]); }
    };
    template<> struct IndexesFromZeroData<char> {
        static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast<const char *>(&_IndexesFromZero8[0]); }
    };
    template<> struct IndexesFromZeroData<unsigned char> {
        static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; }
    };

    // Const<_T> exposes math constants as vectors of type Vector<_T>. Every accessor
    // broadcasts one scalar read from the c_trig<T>::data table or from c_log<T>::d(n)
    // into a V; the numeric literals below are positions in those tables.
    template<typename _T> struct Const
    {
        using V = Vector<_T>;
        using T = typename V::EntryType;
        using M = typename V::Mask;

        // --- constants read from c_trig<T>::data, fixed positions 0..5 ---
        static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig<T>::data[0]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig<T>::data[1]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig<T>::data[2]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig<T>::data[3]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig<T>::data[4]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig<T>::data[5]); }

        // --- further c_trig<T>::data entries; the (int i) accessors index relative to
        //     the start position of their coefficient group ---
        static Vc_ALWAYS_INLINE Vc_CONST V cosCoeff(int i) { return V(c_trig<T>::data[8 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V sinCoeff(int i) { return V(c_trig<T>::data[14 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig<T>::data[24 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig<T>::data[29 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig<T>::data[34]); }
        static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig<T>::data[35]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig<T>::data[36]); }
        static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig<T>::data[20]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig<T>::data[21]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig<T>::data[22]); }
        static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig<T>::data[23]); }
        static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig<T>::data[40 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig<T>::data[45 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig<T>::data[49 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig<T>::data[55 + i]); }
        static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig<T>::data[37]); }
        static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig<T>::data[38]); }

        // --- constants read through c_log<T>::d(n); exponentMask() additionally converts
        //     the broadcast vector's data into a mask ---
        static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log<T>::d(1)).data()); }
        static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log<T>::d(18)); }
        static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log<T>::d(15)); }
        static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log<T>::d(2 + i)); }
        static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log<T>::d(8 + i)); }
        static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log<T>::d(14)); }
        static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log<T>::d(17)); }
        static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log<T>::d(16)); }
        static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log<T>::d(13)); }
        static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log<T>::d(19)); }
        static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log<T>::d(20)); }

        // Declared here only; defined per element type outside the class template.
        static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
    };

    // highMask() specializations: broadcast the bit pattern stored in
    // c_general::highMaskFloat / c_general::highMaskDouble, read as float / double, into
    // every entry of the vector.
    template<>
    Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
    {
        return _mm256_broadcast_ss(
            reinterpret_cast<const float *>(&c_general::highMaskFloat));
    }
    template<>
    Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
    {
        return _mm256_broadcast_sd(
            reinterpret_cast<const double *>(&c_general::highMaskDouble));
    }
}  // namespace AVX

namespace AVX2
{
// Make the AVX helpers IndexesFromZeroData and Const reachable through the AVX2
// namespace as well.
using AVX::IndexesFromZeroData;
using AVX::Const;
}  // namespace AVX2
}  // namespace Vc

#endif // VC_AVX_CONST_H_

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
// Element-wise equality. Every overload hands the raw registers to the AVX::cmpeq_*
// helper for its element width and returns the result as the matching mask type.
// Signed and unsigned variants share one helper (epi32 for int/uint, epi16 for
// short/ushort).
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator==(AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator==(AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }

// Element-wise inequality. The floating-point overloads call the dedicated
// AVX::cmpneq_* helpers; the integer overloads negate the equality compare with not_().
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator!=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator!=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }

// Element-wise a >= b.
// - double/float: uses the ordered greater-or-equal predicate (_CMP_GE_OS), so a lane is
//   false whenever either operand is NaN, matching the scalar >= operator. A negated
//   less-than would instead report true for unordered (NaN) lanes.
// - integers: computed as the negation of the matching less-than helper; the unsigned
//   overloads use the epu* variants of that helper.
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return _mm256_cmp_pd(a.data(), b.data(), _CMP_GE_OS); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return _mm256_cmp_ps(a.data(), b.data(), _CMP_GE_OS); }
Vc_INTRINSIC AVX2::   int_m operator>=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator>=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }

// Element-wise a <= b. The floating-point overloads call AVX::cmple_*; the integer
// overloads negate the matching greater-than helper (epu* variants for unsigned types).
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator<=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator<=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }

// Element-wise a > b. Each overload forwards directly to the AVX::cmpgt_* helper for its
// element type; unsigned types use the epu* variants.
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator> (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator> (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }

// Element-wise a < b. Each overload forwards directly to the AVX::cmplt_* helper for its
// element type; unsigned types use the epu* variants.
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator< (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator< (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }

// bitwise operators {{{1
// Bitwise XOR of the raw registers of a and b.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto bits = xor_(a.data(), b.data());
    return bits;
}
// Bitwise AND of the raw registers of a and b.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto bits = and_(a.data(), b.data());
    return bits;
}
// Bitwise OR of the raw registers of a and b.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto bits = or_(a.data(), b.data());
    return bits;
}
// }}}1
// arithmetic operators {{{1
// Element-wise sum. The trailing T() argument is a value-initialized T passed to add()
// alongside the two registers.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto total = add(a.data(), b.data(), T());
    return total;
}
// Element-wise difference, forwarded to sub() in the same way.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto difference = sub(a.data(), b.data(), T());
    return difference;
}
// Element-wise product, forwarded to mul() in the same way.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto product = mul(a.data(), b.data(), T());
    return product;
}
// Element-wise quotient, forwarded to div() in the same way.
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const auto quotient = div(a.data(), b.data(), T());
    return quotient;
}
// Division for unsigned short vectors. Both 128-bit halves are converted to float,
// divided in single precision and converted back. Per half, if any quotient exceeds
// 32767 the float -> ushort conversion is used, otherwise the float -> short one.
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
                                            AVX2::Vector<ushort> b)
{
    using namespace AVX;
    using Detail::operator>;

    const __m256 quotLo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
                                        convert<ushort, float>(lo128(b.data())));
    const __m256 quotHi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
                                        convert<ushort, float>(hi128(b.data())));
    const float_v limit = 32767.f;

    __m128i packedLo;
    if (Vc_IS_UNLIKELY((float_v(quotLo) > limit).isNotEmpty())) {
        packedLo = convert<float, ushort>(quotLo);
    } else {
        packedLo = convert<float, short>(quotLo);
    }

    __m128i packedHi;
    if (Vc_IS_UNLIKELY((float_v(quotHi) > limit).isNotEmpty())) {
        packedHi = convert<float, ushort>(quotHi);
    } else {
        packedHi = convert<float, short>(quotHi);
    }

    return concat(packedLo, packedHi);
}
// Element-wise remainder for integral element types, computed as a - (a / b) * b.
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
    AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    const AVX2::Vector<T> quotient = a / b;
    return a - quotient * b;
}
// }}}1
}  // namespace Detail
///////////////////////////////////////////////////////////////////////////////////////////
// generate {{{1
// Builds a double_v from gen(0) .. gen(3). The generator is called once per lane, in
// ascending lane order, before the results are packed into the register.
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
    const auto lane0 = gen(0);
    const auto lane1 = gen(1);
    const auto lane2 = gen(2);
    const auto lane3 = gen(3);
    return _mm256_setr_pd(lane0, lane1, lane2, lane3);
}
// Builds a float_v from gen(0) .. gen(7). The generator is called once per lane, in
// ascending lane order, before the results are packed into the register.
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
    const auto lane0 = gen(0);
    const auto lane1 = gen(1);
    const auto lane2 = gen(2);
    const auto lane3 = gen(3);
    const auto lane4 = gen(4);
    const auto lane5 = gen(5);
    const auto lane6 = gen(6);
    const auto lane7 = gen(7);
    return _mm256_setr_ps(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
}
#ifdef Vc_IMPL_AVX2
// Builds an int_v from gen(0) .. gen(7). The generator is called once per lane, in
// ascending lane order, before the results are packed into the register.
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
    const auto lane0 = gen(0);
    const auto lane1 = gen(1);
    const auto lane2 = gen(2);
    const auto lane3 = gen(3);
    const auto lane4 = gen(4);
    const auto lane5 = gen(5);
    const auto lane6 = gen(6);
    const auto lane7 = gen(7);
    return _mm256_setr_epi32(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
}
// Builds a uint_v from gen(0) .. gen(7). The generator is called once per lane, in
// ascending lane order, before the results are packed into the register.
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
    const auto lane0 = gen(0);
    const auto lane1 = gen(1);
    const auto lane2 = gen(2);
    const auto lane3 = gen(3);
    const auto lane4 = gen(4);
    const auto lane5 = gen(5);
    const auto lane6 = gen(6);
    const auto lane7 = gen(7);
    return _mm256_setr_epi32(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
}
// Builds a short_v from gen(0) .. gen(15). The generator is called once per lane, in
// ascending lane order, before the results are packed into the register.
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
    const auto lane0 = gen(0);
    const auto lane1 = gen(1);
    const auto lane2 = gen(2);
    const auto lane3 = gen(3);
    const auto lane4 = gen(4);
    const auto lane5 = gen(5);
    const auto lane6 = gen(6);
    const auto lane7 = gen(7);
    const auto lane8 = gen(8);
    const auto lane9 = gen(9);
    const auto lane10 = gen(10);
    const auto lane11 = gen(11);
    const auto lane12 = gen(12);
    const auto lane13 = gen(13);
    const auto lane14 = gen(14);
    const auto lane15 = gen(15);
    return _mm256_setr_epi16(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7,
                             lane8, lane9, lane10, lane11, lane12, lane13, lane14, lane15);
}
// Builds a ushort_v from gen(0) .. gen(15). The generator is called once per lane, in
// ascending lane order, before the results are packed into the register.
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
    const auto lane0 = gen(0);
    const auto lane1 = gen(1);
    const auto lane2 = gen(2);
    const auto lane3 = gen(3);
    const auto lane4 = gen(4);
    const auto lane5 = gen(5);
    const auto lane6 = gen(6);
    const auto lane7 = gen(7);
    const auto lane8 = gen(8);
    const auto lane9 = gen(9);
    const auto lane10 = gen(10);
    const auto lane11 = gen(11);
    const auto lane12 = gen(12);
    const auto lane13 = gen(13);
    const auto lane14 = gen(14);
    const auto lane15 = gen(15);
    return _mm256_setr_epi16(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7,
                             lane8, lane9, lane10, lane11, lane12, lane13, lane14, lane15);
}
#endif

// constants {{{1
// Zero constructor: value-initializes the storage member d.
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}

// One constructors for the floating-point types: initialize d from the AVX::setone_*
// helper of the element type.
template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
// One constructors for the integer types (only compiled when Vc_IMPL_AVX2 is defined):
// initialize d from the AVX::setone_* helper of the element type.
template <> Vc_INTRINSIC Vector<   int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector<  uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif

// IndexesFromZero constructor (generic): delegates to the aligned load constructor,
// reading from the table exposed by AVX::IndexesFromZeroData<T>::address().
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
    VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}

// float specialization: loads from the int table instead of a float one.
// NOTE(review): presumably the load constructor converts int -> float; confirm there.
template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
// double specialization: likewise loads from the int table.
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
// general load, implemented via LoadHelper {{{2
// Loads this vector from mem. Prefetch handling for the given flags runs first, then the
// register is filled by Detail::load, which is told the register type and DstT.
// The extra `template` keyword in the return type is omitted when Vc_MSVC is defined.
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}

///////////////////////////////////////////////////////////////////////////////////////////
// zeroing {{{1
// Replaces the whole vector with Detail::zero<VectorType>().
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
    data() = Detail::zero<VectorType>();
}
// Masked variant: combines the mask bits with the data through Detail::andnot_.
// NOTE(review): assuming andnot_(k, v) computes (~k) & v like the intrinsic, the lanes
// selected by k become zero and the others are kept.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
    data() = Detail::andnot_(AVX::avx_cast<VectorType>(k.data()), data());
}
// Inverted masked variant: ANDs the data with the mask bits, so only lanes whose mask
// bits are set keep their value.
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(AVX::avx_cast<VectorType>(k.data()), data());
}

// Sets every double lane to the all-ones bit pattern (Detail::allone), which is a NaN
// encoding.
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
// Masked variant: ORs the mask bits into the data, so lanes whose mask bits are all set
// become all-ones while the remaining lanes are left unchanged.
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_pd(data(), k.dataD());
}
// Sets every float lane to the all-ones bit pattern (Detail::allone), which is a NaN
// encoding.
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
// Masked variant for float: ORs the mask bits into the data.
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_ps(data(), k.data());
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
// Stores the vector to mem. Prefetch handling for the given flags runs first; the actual
// store is delegated to HV::store, parameterized on Flags.
template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}

// Masked store: same as above, additionally passing the mask (cast to the register type)
// to HV::store.
template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), AVX::avx_cast<VectorType>(mask.data()));
}

///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_AVX2
// Per-lane shifts by a vector of shift counts.
// 32-bit types map to the variable-shift intrinsics: sllv for <<, srav (arithmetic) for
// signed >> and srlv (logical) for unsigned >>.
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
// 16-bit types are shifted lane by lane through generate(), using the scalar operators
// on get(*this, i) and get(x, i).
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
// In-place per-lane left shift by the counts in x; only valid for integral T.
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    *this = *this << x;
    return *this;
}
// In-place per-lane right shift by the counts in x; only valid for integral T.
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    *this = *this >> x;
    return *this;
}
#endif

// Shifts by a single scalar count applied to every lane. All four operators delegate to
// Detail::shiftRight / Detail::shiftLeft, passing a value-initialized T alongside the
// register and the count. The compound forms update the register in place and return
// *this.
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
    d.v() = Detail::shiftRight(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
    return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
    d.v() = Detail::shiftLeft(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
    return Detail::shiftLeft(d.v(), shift, T());
}

// isnegative {{{1
// Returns a mask that is set for every float lane whose sign bit is set.
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
    // keep only the sign bit of each lane ...
    const __m256 signOnly = _mm256_and_ps(AVX::setsignmask_ps(), x.data());
    // ... and replicate it over the whole 32-bit lane with an arithmetic shift
    const auto spread = AVX::srai_epi32<31>(AVX::avx_cast<__m256i>(signOnly));
    return AVX::avx_cast<__m256>(spread);
}
// Returns a mask that is set for every double lane whose sign bit is set.
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
    // keep only the sign bit of each lane
    const __m256d signOnly = _mm256_and_pd(AVX::setsignmask_pd(), x.data());
    // the arithmetic shift works on 32-bit elements, so only the upper half of each
    // 64-bit lane carries the replicated sign afterwards
    const auto spread = AVX::srai_epi32<31>(AVX::avx_cast<__m256i>(signOnly));
    // copy the odd 32-bit elements (X1, X3) over their even neighbours
    return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(spread));
}
// gathers {{{1
// Generic gather for double_v: reads mem[indexes[k]] for k = 0..3 and packs the values
// in lane order.
template <>
template <typename MT, typename IT>
inline void AVX2::double_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const double lane0 = mem[indexes[0]];
    const double lane1 = mem[indexes[1]];
    const double lane2 = mem[indexes[2]];
    const double lane3 = mem[indexes[3]];
    d.v() = _mm256_setr_pd(lane0, lane1, lane2, lane3);
}

// Generic gather for float_v: reads mem[indexes[k]] for k = 0..7 and packs the values
// in lane order.
template <>
template <typename MT, typename IT>
inline void AVX2::float_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    const float lane0 = mem[indexes[0]];
    const float lane1 = mem[indexes[1]];
    const float lane2 = mem[indexes[2]];
    const float lane3 = mem[indexes[3]];
    const float lane4 = mem[indexes[4]];
    const float lane5 = mem[indexes[5]];
    const float lane6 = mem[indexes[6]];
    const float lane7 = mem[indexes[7]];
    d.v() = _mm256_setr_ps(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
}

#ifdef Vc_IMPL_AVX2
#ifndef Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
// Hardware gather for double_v with SSE::int_v indexes; the scale passed to the
// intrinsic is sizeof(double).
template <>
Vc_INTRINSIC void AVX2::double_v::gatherImplementation(const double *mem,
                                                       SSE::int_v indexes)
{
    d.v() = _mm256_i32gather_pd(mem, indexes.data(), sizeof(double));
}

// Hardware gather for float_v with AVX2::int_v indexes; scale is sizeof(float).
template <>
Vc_INTRINSIC void AVX2::float_v::gatherImplementation(const float *mem,
                                                      AVX2::int_v indexes)
{
    d.v() = _mm256_i32gather_ps(mem, indexes.data(), sizeof(float));
}

// Hardware gather for int_v with AVX2::int_v indexes; scale is sizeof(int).
template <>
Vc_INTRINSIC void AVX2::int_v::gatherImplementation(const int *mem,
                                                    AVX2::int_v indexes)
{
    d.v() = _mm256_i32gather_epi32(mem, indexes.data(), sizeof(int));
}

// Hardware gather for uint_v: the intrinsic takes an int pointer, so mem is cast to a
// MayAlias<int> pointer; scale is sizeof(unsigned).
template <>
Vc_INTRINSIC void AVX2::uint_v::gatherImplementation(const uint *mem,
                                                     AVX2::int_v indexes)
{
    d.v() = _mm256_i32gather_epi32(reinterpret_cast<const MayAlias<int> *>(mem), indexes.data(),
                                   sizeof(unsigned));
}
#endif  // !Vc_MSVC

// Generic gather for int_v: reads mem[indexes[k]] for k = 0..7 and packs the values in
// lane order.
template <>
template <typename MT, typename IT>
inline void AVX2::int_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
                              mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
                              mem[indexes[6]], mem[indexes[7]]);
}

// Generic gather for uint_v: same scheme as the int_v version.
template <>
template <typename MT, typename IT>
inline void AVX2::uint_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
                              mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
                              mem[indexes[6]], mem[indexes[7]]);
}

// Generic gather for short_v: reads mem[indexes[k]] for k = 0..15 and packs the values
// in lane order.
template <>
template <typename MT, typename IT>
inline void AVX2::short_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
                              mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
                              mem[indexes[6]], mem[indexes[7]], mem[indexes[8]],
                              mem[indexes[9]], mem[indexes[10]], mem[indexes[11]],
                              mem[indexes[12]], mem[indexes[13]], mem[indexes[14]],
                              mem[indexes[15]]);
}

// Generic gather for ushort_v: same scheme as the short_v version.
template <>
template <typename MT, typename IT>
inline void AVX2::ushort_v::gatherImplementation(const MT *mem, const IT &indexes)
{
    d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
                              mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
                              mem[indexes[6]], mem[indexes[7]], mem[indexes[8]],
                              mem[indexes[9]], mem[indexes[10]], mem[indexes[11]],
                              mem[indexes[12]], mem[indexes[13]], mem[indexes[14]],
                              mem[indexes[15]]);
}
#endif

// Masked gather. The strategy is picked at compile time and handed to
// Common::executeGather as a tag:
//  - with Vc_USE_SET_GATHERS and a SIMD-vector index type: SetIndexZero;
//  - otherwise BitScanLoop (Vc_USE_BSF_GATHERS), PopcntSwitch
//    (Vc_USE_POPCNT_BSF_GATHERS) or, by default, SimpleLoop.
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask)
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}

// Unmasked scatter: writes lane i of this vector to mem[indexes[i]] for every lane,
// using a compile-time unrolled loop over [0, Size).
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
    const auto storeLane = [&](std::size_t lane) { mem[indexes[lane]] = d.m(lane); };
    Common::unrolled_loop<std::size_t, 0, Size>(storeLane);
}

// Masked scatter. The strategy is picked at compile time and handed to
// Common::executeScatter as a tag:
//  - with Vc_USE_SET_GATHERS and a SIMD-vector index type: SetIndexZero;
//  - otherwise BitScanLoop (Vc_USE_BSF_GATHERS), PopcntSwitch
//    (Vc_USE_POPCNT_BSF_GATHERS) or, by default, SimpleLoop.
// The indexes argument is perfectly forwarded.
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
// Unary minus. Two variants, selected by Vc_USE_BUILTIN_VECTOR_TYPES:
//  - with builtin vector types, the compiler's own negation of d.builtin() is used;
//  - otherwise Detail::negate is called, tagged with sizeof(T).
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
// Returns the horizontal minimum, broadcast to a vector, together with the position
// reported by firstOne() on the lanes that compare equal to it.
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
    const AVX2::Vector<T> smallest = min();
    const auto where = (*this == smallest).firstOne();
    return std::make_pair(smallest, where);
}
// Returns the horizontal maximum, broadcast to a vector, together with the position
// reported by firstOne() on the lanes that compare equal to it.
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
    const AVX2::Vector<T> largest = max();
    const auto where = (*this == largest).firstOne();
    return std::make_pair(largest, where);
}
// Specialized minIndex for float_v.
//
// Runs a three-step reduction (128-bit halves, then pairs, then neighbours). In every
// step the values and a parallel vector of lane indexes are blended with the same
// "less than" mask, so the index travels with the value that wins the comparison.
// Returns the reduced value vector and the index found in lane 0.
//
// The index vector comes from Vector<float>::IndexesFromZero(), i.e. it holds the lane
// numbers as float values. The result is therefore extracted with a float -> int
// conversion; reinterpreting the lane's bits as an integer would return the float's bit
// pattern instead of the lane number for every lane except 0.
//
// NOTE(review): the comparison is a strict "less than" and the other operand is taken
// otherwise, so for equal values the reported index is not necessarily the lowest one.
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
    /*
    // 28 cycles latency:
    __m256 x = _mm256_min_ps(Mem::permute128<X1, X0>(d.v()), d.v());
    x = _mm256_min_ps(x, Reg::permute<X2, X3, X0, X1>(x));
    AVX2::float_v xx = _mm256_min_ps(x, Reg::permute<X1, X0, X3, X2>(x));
    AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero();
    idx = _mm256_castps_si256(
        _mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data())));
    return std::make_pair(xx, (*this == xx).firstOne());

    __m128 loData = AVX::lo128(d.v());
    __m128 hiData = AVX::hi128(d.v());
    const __m128 less2 = _mm_cmplt_ps(hiData, loData);
    loData = _mm_min_ps(loData, hiData);
    hiData = Mem::permute<X2, X3, X0, X1>(loData);
    const __m128 less1 = _mm_cmplt_ps(hiData, loData);
    loData = _mm_min_ps(loData, hiData);
    hiData = Mem::permute<X1, X0, X3, X2>(loData);
    const __m128 less0 = _mm_cmplt_ps(hiData, loData);
    unsigned bits = _mm_movemask_ps(less0) & 0x1;
    bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2;
    bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4;
    loData = _mm_min_ps(loData, hiData);
    return std::make_pair(AVX::concat(loData, loData), bits);
    */

    // 28 cycles Latency:
    __m256 x = d.v();
    __m256 idx = Vector<float>::IndexesFromZero().data();
    // step 1: compare against the swapped 128-bit halves
    __m256 y = Mem::permute128<X1, X0>(x);
    __m256 idy = Mem::permute128<X1, X0>(idx);
    __m256 less = AVX::cmplt_ps(x, y);

    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    // step 2: compare against the swapped element pairs
    y = Reg::permute<X2, X3, X0, X1>(x);
    idy = Reg::permute<X2, X3, X0, X1>(idx);
    less = AVX::cmplt_ps(x, y);

    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    // step 3: compare against the swapped neighbours
    y = Reg::permute<X1, X0, X3, X2>(x);
    idy = Reg::permute<X1, X0, X3, X2>(idx);
    less = AVX::cmplt_ps(x, y);

    idx = _mm256_blendv_ps(idy, idx, less);

    // lane 0 holds the winning lane number as a float value; convert it to int
    const auto index = _mm_cvttss_si32(_mm256_castps256_ps128(idx));
#ifdef Vc_GNU_ASM
    __asm__ __volatile__(""); // help GCC to order the instructions better
#endif
    x = _mm256_blendv_ps(y, x, less);
    return std::make_pair(x, index);
}
// Inclusive prefix sum over the lanes, built by repeatedly adding a copy of the running
// result shifted by 1, 2, 4, ... lanes (only the steps smaller than Size are applied).
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
{
    //   a    b    c    d    e    f    g    h
    // +      a    b    c    d    e    f    g    -> a ab bc  cd   de    ef     fg      gh
    // +           a    ab   bc   cd   de   ef   -> a ab abc abcd bcde  cdef   defg    efgh
    // +                     a    ab   abc  abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> running = *this;
    if (Size > 1) {
        running += running.shifted(-1);
    }
    if (Size > 2) {
        running += running.shifted(-2);
    }
    if (Size > 4) {
        running += running.shifted(-4);
    }
    if (Size > 8) {
        running += running.shifted(-8);
    }
    if (Size > 16) {
        running += running.shifted(-16);
    }
    return running;
}

/* This function requires correct masking because the neutral element of \p op is not necessarily 0
 *
template<typename T> template<typename BinaryOperation> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum(BinaryOperation op) const
{
    //   a    b    c    d    e    f    g    h
    // +      a    b    c    d    e    f    g    -> a ab bc  cd   de    ef     fg      gh
    // +           a    ab   bc   cd   de   ef   -> a ab abc abcd bcde  cdef   defg    efgh
    // +                     a    ab   abc  abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> tmp = *this;
    Mask mask(true);
    if (Size >  1) tmp(mask) = op(tmp, tmp.shifted(-1));
    if (Size >  2) tmp(mask) = op(tmp, tmp.shifted(-2));
    if (Size >  4) tmp(mask) = op(tmp, tmp.shifted(-4));
    if (Size >  8) tmp(mask) = op(tmp, tmp.shifted(-8));
    if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16));
    return tmp;
}
*/

// Horizontal minimum over the lanes selected by m. Unselected lanes are filled with the
// numeric_limits maximum so they do not influence the result.
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
    AVX2::Vector<T> candidates = std::numeric_limits<AVX2::Vector<T> >::max();
    candidates(m) = *this;
    return candidates.min();
}
// Horizontal maximum over the lanes selected by m.
// Unselected lanes are filled with numeric_limits<T>::lowest() so they cannot win the
// reduction. numeric_limits<T>::min() must not be used as filler: for float and double it
// is the smallest positive normal value, which would be returned instead of the true
// maximum whenever all selected lanes are zero or negative. For integer types lowest()
// and min() are identical.
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
    AVX2::Vector<T> tmp(std::numeric_limits<T>::lowest());
    tmp(m) = *this;
    return tmp.max();
}
// Horizontal product over the lanes selected by m. Unselected lanes are set to one, the
// neutral element of multiplication.
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
    AVX2::Vector<T> factors(Vc::One);
    factors(m) = *this;
    return factors.product();
}
// Horizontal sum over the lanes selected by m. Unselected lanes are set to zero, the
// neutral element of addition.
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
    AVX2::Vector<T> summands(Vc::Zero);
    summands(m) = *this;
    return summands.sum();
}//}}}
// exponent {{{1
namespace Detail
{
// Extracts the unbiased binary exponent of each float lane and returns it as a float.
// The bits are shifted right by 23 and the bias 0x7f is subtracted; the sign bit is not
// masked off by this function.
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
    using namespace AVX;
    const __m128i bias = _mm_set1_epi32(0x7f);
    const __m128i rawLo = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
    const __m128i rawHi = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
    const __m128i expLo = _mm_sub_epi32(rawLo, bias);
    const __m128i expHi = _mm_sub_epi32(rawHi, bias);
    return _mm256_cvtepi32_ps(concat(expLo, expHi));
}
// Extracts the unbiased binary exponent of each double lane and returns it as a double.
// Each 64-bit lane is shifted right by 52 and the bias 0x3ff is subtracted with a 32-bit
// subtraction; the low 32-bit element of every 64-bit lane is then picked (X0, X2 of
// each half) and converted. The sign bit is not masked off by this function.
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
    using namespace AVX;
    const __m128i bias = _mm_set1_epi32(0x3ff);
    const __m128i rawLo = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
    const __m128i rawHi = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
    const __m128i expLo = _mm_sub_epi32(rawLo, bias);
    const __m128i expHi = _mm_sub_epi32(rawHi, bias);
    const auto packed =
        Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(expLo), avx_cast<__m128>(expHi));
    return _mm256_cvtepi32_pd(avx_cast<__m128i>(packed));
}
} // namespace Detail

// Returns the binary exponent of each lane of x, as computed by Detail::exponent.
// Precondition (checked with Vc_ASSERT): every lane satisfies x >= 0.
Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// Double-precision counterpart with the same precondition.
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// }}}1
// Random {{{1
// Advances the shared generator state in Common::RandomState by one step and returns the
// 256 bits that were stored in its first eight entries before the update.
//
// Every state word is advanced as state * 0xdeece66d + 11. The words that are returned
// are additionally XORed with the (pre-update) second group of words shifted right by 16
// before being written back.
//
// With Vc_IMPL_AVX2 the state is handled as two 256-bit vectors; otherwise as four
// 128-bit SSE vectors whose first two are concatenated for the return value.
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
    using Detail::operator*;
    using Detail::operator+;
#ifdef Vc_IMPL_AVX2
    using AVX2::uint_v;
    // load both halves before anything is written back
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm256_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
    return state0.data();
#else
    using SSE::uint_v;
    // load all four quarters before anything is written back
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    uint_v state2(&Common::RandomState[2 * uint_v::Size]);
    uint_v state3(&Common::RandomState[3 * uint_v::Size]);
    (state2 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[2 * uint_v::Size]);
    (state3 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[3 * uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state2.data(), 16)))
        .store(&Common::RandomState[0]);
    uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state3.data(), 16)))
        .store(&Common::RandomState[uint_v::Size]);
    return AVX::concat(state0.data(), state1.data());
#endif
}

#ifdef Vc_IMPL_AVX2
// Generic Random(): returns the raw 256 bits produced by one generator step.
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
    return {_doRandomStep()};
}
#endif

// Random floats: the bits of one generator step are shifted right by 2 and ORed with the
// bit pattern of 1.0f, then 1.0f is subtracted from the result.
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
    const auto randomBits = _cast(AVX::srli_epi32<2>(_doRandomStep()));
    const auto withOne = Detail::or_(randomBits, HT::one());
    return HT::sub(withOne, HT::one());
}

// Random() for double_v: reads 256 bits from the start of Common::RandomState,
// then advances that part of the state, and derives the result from the bits that
// were read (shift each 64-bit word right by 12, OR with HT::one(), subtract
// HT::one()).
// NOTE(review): presumably this yields values in [0, 1) - confirm against the
// definition of HT::one().
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
    // The state array is accessed as 64-bit words below, hence the may-alias type.
    typedef unsigned long long uint64 Vc_MAY_ALIAS;

    // Snapshot first: the returned value is based on the state before the update.
    const __m256i snapshot = Detail::load(&Common::RandomState[0], Vc::Aligned,
                                          Detail::LoadTag<__m256i, int>());

    // Update four 64-bit words, located at state indexes 0, 2, 4 and 6.
    for (size_t word = 0; word < 4; ++word) {
        uint64 *const slot = reinterpret_cast<uint64 *>(&Common::RandomState[2 * word]);
        const uint64 previous = *slot;
        *slot = (previous * 0x5deece66dull + 11);
    }

    const auto withOne = Detail::or_(_cast(AVX::srli_epi64<12>(snapshot)), HT::one());
    return HT::sub(withOne, HT::one());
}
// }}}1
// shifted / rotated {{{1
// Shifts the entries of this vector by `amount` positions; the actual work is
// delegated to Detail::shifted for the vector's entry type.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
    const VectorType &self = d.v();
    return Detail::shifted<EntryType>(self, amount);
}

// Helpers for shifted(amount, shiftIn) when the shift is exactly half a vector.
// The unnamed third parameter only selects the overload by register width.

// 128-bit registers: shuffle the two inputs with the pattern <X2, X3, Y0, Y1>.
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
    const VectorType spliced = Mem::shuffle<X2, X3, Y0, Y1>(left, right);
    return spliced;
}
// 256-bit registers: combine 128-bit halves of the inputs with the pattern <X1, Y0>.
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
    const VectorType spliced = Mem::shuffle128<X1, Y0>(left, right);
    return spliced;
}

// Shifts the entries of this vector by `amount` positions and fills the positions
// that become vacant with entries of `shiftIn`.
//
// Parameters:
//   amount   number of entries to shift by (may be negative)
//   shiftIn  vector that supplies the entries shifted in
//
// With GCC-compatible compilers and a compile-time constant `amount`, a few cases
// are implemented directly:
//   * amount == +Size/2 or -Size/2: one lane shuffle (shifted_shortcut)
//   * amount == 1, 2 (and 3 for vectors with more than 6 entries): byte-wise
//     alignr over the concatenation of *this and shiftIn
// All other cases combine two single-vector shifts with a bitwise OR.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
    if (__builtin_constant_p(amount)) {
        const __m256i a = AVX::avx_cast<__m256i>(d.v());
        const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
        if (amount * 2 == int(Size)) {
            return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
        }
        if (amount * 2 == -int(Size)) {
            return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
        }
        switch (amount) {
        case 1:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                    );
        case 2:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   2 * sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                    );
        case 3:
            // Only valid if 3 entries still fit into one 128-bit lane shift;
            // otherwise fall through to the generic implementation below.
            if (6u < Size) {
                return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                    _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                       3 * sizeof(EntryType))
#else   // Vc_IMPL_AVX2
                    AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
                                                3 * sizeof(EntryType)),
                                _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
                                                3 * sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                        );
            // TODO: } else {
            }
        }
    }
#endif
    using Detail::operator|;
    // Do the arithmetic in int (as the comparisons above do). Mixing `amount`
    // with the unsigned Size would compute the (negative) shift count in
    // unsigned arithmetic and rely on an implementation-defined conversion back
    // to int.
    const int entries = int(Size);
    return shifted(amount) | (amount > 0 ?
                              shiftIn.shifted(amount - entries) :
                              shiftIn.shifted(entries + amount));
}
// Rotates the entries of this vector by `amount` positions; delegated to
// Detail::rotated, parameterized on the entry type and the number of entries.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
    const VectorType &self = d.v();
    return Detail::rotated<EntryType, size()>(self, amount);
}
// sorted {{{1
// Returns the result of Detail::sorted applied to this vector; *this itself is
// not modified.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
    const
{
    const Vector &self = *this;
    return Detail::sorted(self);
}
// interleaveLow/-High {{{1
// interleaveLow / interleaveHigh for double_v.
// Both first interleave *this with x inside each 128-bit lane (unpacklo / unpackhi)
// and then pick one 128-bit lane from each intermediate result:
// <X0, Y0> for interleaveLow and <X1, Y1> for interleaveHigh.
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
    const __m256d lowPairs = _mm256_unpacklo_pd(data(), x.data());
    const __m256d highPairs = _mm256_unpackhi_pd(data(), x.data());
    return Mem::shuffle128<X0, Y0>(lowPairs, highPairs);
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
    const __m256d lowPairs = _mm256_unpacklo_pd(data(), x.data());
    const __m256d highPairs = _mm256_unpackhi_pd(data(), x.data());
    return Mem::shuffle128<X1, Y1>(lowPairs, highPairs);
}
// interleaveLow / interleaveHigh for float_v.
// Both first interleave *this with x inside each 128-bit lane (unpacklo / unpackhi)
// and then pick one 128-bit lane from each intermediate result:
// <X0, Y0> for interleaveLow and <X1, Y1> for interleaveHigh.
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
    const __m256 lowPairs = _mm256_unpacklo_ps(data(), x.data());
    const __m256 highPairs = _mm256_unpackhi_ps(data(), x.data());
    return Mem::shuffle128<X0, Y0>(lowPairs, highPairs);
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
    const __m256 lowPairs = _mm256_unpacklo_ps(data(), x.data());
    const __m256 highPairs = _mm256_unpackhi_ps(data(), x.data());
    return Mem::shuffle128<X1, Y1>(lowPairs, highPairs);
}
#ifdef Vc_IMPL_AVX2
// interleaveLow / interleaveHigh for int_v: 32-bit unpack inside each 128-bit lane,
// followed by a 128-bit lane selection (<X0, Y0> resp. <X1, Y1>).
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow(AVX2::int_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi32(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi32(data(), x.data());
    return Mem::shuffle128<X0, Y0>(lowPairs, highPairs);
}
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh(AVX2::int_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi32(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi32(data(), x.data());
    return Mem::shuffle128<X1, Y1>(lowPairs, highPairs);
}
// interleaveLow / interleaveHigh for uint_v: 32-bit unpack inside each 128-bit lane,
// followed by a 128-bit lane selection (<X0, Y0> resp. <X1, Y1>).
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow(AVX2::uint_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi32(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi32(data(), x.data());
    return Mem::shuffle128<X0, Y0>(lowPairs, highPairs);
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh(AVX2::uint_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi32(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi32(data(), x.data());
    return Mem::shuffle128<X1, Y1>(lowPairs, highPairs);
}
// interleaveLow / interleaveHigh for short_v: 16-bit unpack inside each 128-bit lane,
// followed by a 128-bit lane selection (<X0, Y0> resp. <X1, Y1>).
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow(AVX2::short_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi16(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi16(data(), x.data());
    return Mem::shuffle128<X0, Y0>(lowPairs, highPairs);
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh(AVX2::short_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi16(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi16(data(), x.data());
    return Mem::shuffle128<X1, Y1>(lowPairs, highPairs);
}
// interleaveLow / interleaveHigh for ushort_v: 16-bit unpack inside each 128-bit
// lane, followed by a 128-bit lane selection (<X0, Y0> resp. <X1, Y1>).
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow(AVX2::ushort_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi16(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi16(data(), x.data());
    return Mem::shuffle128<X0, Y0>(lowPairs, highPairs);
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const
{
    const __m256i lowPairs = _mm256_unpacklo_epi16(data(), x.data());
    const __m256i highPairs = _mm256_unpackhi_epi16(data(), x.data());
    return Mem::shuffle128<X1, Y1>(lowPairs, highPairs);
}
#endif
// permutation via operator[] {{{1
// operator[](Permutation::ReversedTag) for double_v and float_v.
// Step 1 applies a permutation pattern with descending positions inside the
// vector's 128-bit halves, step 2 exchanges the two 128-bit halves (<X1, X0>).
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
    const auto reversedInLanes = Mem::permute<X1, X0, X3, X2>(d.v());
    return Mem::permute128<X1, X0>(reversedInLanes);
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
    const auto reversedInLanes = Mem::permute<X3, X2, X1, X0>(d.v());
    return Mem::permute128<X1, X0>(reversedInLanes);
}
#ifdef Vc_IMPL_AVX2
// operator[](Permutation::ReversedTag) for int_v and uint_v.
// Step 1 applies the pattern <X3, X2, X1, X0> inside the 128-bit halves, step 2
// exchanges the two 128-bit halves (<X1, X0>).
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
    const auto reversedInLanes = Mem::permute<X3, X2, X1, X0>(d.v());
    return Mem::permute128<X1, X0>(reversedInLanes);
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
    const auto reversedInLanes = Mem::permute<X3, X2, X1, X0>(d.v());
    return Mem::permute128<X1, X0>(reversedInLanes);
}
// operator[](Permutation::ReversedTag) for short_v and ushort_v.
// The 16-bit entries are reordered in three steps:
//   1. permuteHi<X7, X6, X5, X4> and permuteLo<X3, X2, X1, X0> each reorder one
//      group of four entries,
//   2. a 64-bit shuffle <X1, Y0, X3, Y2> (done on the __m256d view) combines the
//      two intermediate results,
//   3. permute128<X1, X0> exchanges the two 128-bit halves.
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
    Permutation::ReversedTag) const
{
    const __m256d upperGroups =
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v()));
    const __m256d lowerGroups =
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()));
    const auto reversedInLanes = Mem::shuffle<X1, Y0, X3, Y2>(upperGroups, lowerGroups);
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(reversedInLanes));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
    Permutation::ReversedTag) const
{
    const __m256d upperGroups =
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v()));
    const __m256d lowerGroups =
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()));
    const auto reversedInLanes = Mem::shuffle<X1, Y0, X3, Y2>(upperGroups, lowerGroups);
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(reversedInLanes));
}
#endif
// Permutation of a float_v by a run-time index vector.
// NOTE(review): this is an unimplemented stub. The index argument is ignored and
// the vector is returned unchanged. Everything after the return statement is
// unreachable; the sketched implementation for the non-AVX2 case is still
// commented out and incomplete.
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &/*perm*/) const
{
    // TODO
    return *this;
#ifdef Vc_IMPL_AVX2
#else
    /*
    const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)),
                                  _mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
    AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#endif
}

// reversed {{{1
// Returns this vector indexed with Permutation::Reversed, i.e. the work is done by
// the operator[](Permutation::ReversedTag) specialization of the vector type.
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
    const Vector &self = *this;
    return self[Permutation::Reversed];
}

// broadcast from constexpr index {{{1
// Broadcasts the entry at compile-time position Index to all entries of the result.
// Bit 2 of Index selects the 128-bit half, bits 0-1 select the position inside it;
// higher bits of Index are ignored.
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
    constexpr VecPos Half = static_cast<VecPos>((Index & 0x4) >> 2);
    constexpr VecPos Slot = static_cast<VecPos>(Index & 0x3);
    // First duplicate the selected 128-bit half, then the selected entry within it.
    const auto duplicatedHalf = Mem::permute128<Half, Half>(d.v());
    return Mem::permute<Slot, Slot, Slot, Slot>(duplicatedHalf);
}
// Broadcasts the entry at compile-time position Index to all entries of the result.
// Bit 1 of Index selects the 128-bit half, bit 0 selects the position inside it;
// higher bits of Index are ignored.
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
    constexpr VecPos Half = static_cast<VecPos>((Index & 0x2) >> 1);
    constexpr VecPos Slot = static_cast<VecPos>(Index & 0x1);
    // First duplicate the selected 128-bit half, then the selected entry within it.
    const auto duplicatedHalf = Mem::permute128<Half, Half>(d.v());
    return Mem::permute<Slot, Slot>(duplicatedHalf);
}
// }}}1
}  // namespace Vc

// vim: foldmethod=marker
/*  This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_SIMD_CAST_H_
#define VC_AVX_SIMD_CAST_H_

#ifndef VC_AVX_VECTOR_H_
#error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h"
#endif

namespace Vc_VERSIONED_NAMESPACE
{
// Declarations: helper macros Vc_SIMD_CAST_AVX_[1-4], Vc_SIMD_CAST_[1-8] & Vc_SIMD_CAST_OFFSET {{{1
//
// Each Vc_SIMD_CAST_AVX_<N>(from_, to_) expands to the head of a function template
//     template <typename To> To simd_cast(AVX2::from_ x0, ..., enable_if<...> = nullarg)
// taking N vectors of type AVX2::from_. The overload is only enabled if To is
// exactly AVX2::to_. The expansion has no trailing semicolon or body, so the
// macro can be used for declarations (followed by ';') and for definitions.

// One AVX2::from_ argument.
#define Vc_SIMD_CAST_AVX_1(from_, to_)                                                   \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x, enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)

// Two AVX2::from_ arguments; additionally asserts at compile time that two source
// vectors do not hold more entries than one destination vector.
#define Vc_SIMD_CAST_AVX_2(from_, to_)                                                   \
    static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(),                          \
                  "this type combination is wrong");                                     \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x0, AVX2::from_ x1,                                                  \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)

// Three AVX2::from_ arguments (no size check).
#define Vc_SIMD_CAST_AVX_3(from_, to_)                                                   \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2,                                  \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)

// Four AVX2::from_ arguments (no size check).
#define Vc_SIMD_CAST_AVX_4(from_, to_)                                                   \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3,                  \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)

// Vc_SIMD_CAST_<N>(from_, to_), N = 1..8: like the Vc_SIMD_CAST_AVX_<N> macros, but
// from_ and to_ are used as given (no AVX2:: prefix is added), so they can name
// types from any namespace. Each expands to the head of
//     template <typename To> To simd_cast(from_ x0, ..., enable_if<...> = nullarg)
// with N parameters of type from_, enabled only if To is exactly to_.

// One from_ argument.
#define Vc_SIMD_CAST_1(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)

// Two from_ arguments.
#define Vc_SIMD_CAST_2(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)

// Three from_ arguments.
#define Vc_SIMD_CAST_3(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, enable_if<std::is_same<To, to_>::value> = nullarg)

// Four from_ arguments.
#define Vc_SIMD_CAST_4(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3,                                          \
        enable_if<std::is_same<To, to_>::value> = nullarg)

// Five from_ arguments.
#define Vc_SIMD_CAST_5(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4,                                \
        enable_if<std::is_same<To, to_>::value> = nullarg)

// Six from_ arguments.
#define Vc_SIMD_CAST_6(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5,                      \
        enable_if<std::is_same<To, to_>::value> = nullarg)

// Seven from_ arguments.
#define Vc_SIMD_CAST_7(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6,            \
        enable_if<std::is_same<To, to_>::value> = nullarg)

// Eight from_ arguments.
#define Vc_SIMD_CAST_8(from_, to_)                                                       \
    template <typename To>                                                               \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7,  \
        enable_if<std::is_same<To, to_>::value> = nullarg)

// Vc_SIMD_CAST_OFFSET(from_, to_, offset_): head of the simd_cast overload with an
// additional `int offset` template parameter. The overload takes one from_ vector
// and is enabled only if To is exactly to_ and offset equals offset_. The
// static_assert rejects offsets for which from_ does not hold at least
// (offset_ + 1) * to_::size() entries.
#define Vc_SIMD_CAST_OFFSET(from_, to_, offset_)                                         \
    static_assert(from_::size() >= to_::size() * (offset_ + 1),                          \
                  "this offset cannot exist for this type combination");                 \
    template <typename To, int offset>                                                   \
    Vc_INTRINSIC Vc_CONST To simd_cast(                                                  \
        from_ x,                                                                         \
        enable_if<(offset == offset_ && std::is_same<To, to_>::value)> = nullarg)

// Declaration: SSE -> AVX where the AVX Vector is integral and thus of equal size() {{{1
// as the equivalent SSE Vector
//
// Generic overloads taking 1, 2, 3, 4 or 8 SSE vectors of the same type From. They
// are only declared here. Each overload is enabled only if To is an AVX2 vector
// type, From is an SSE vector type, and the SSE vector with To's EntryType has the
// same number of entries as To.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                             SSE::Vector<typename To::EntryType>::Size == To::Size)> =
                      nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1, From x2,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1, From x2, From x3,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
// Note: there are no overloads for 5, 6 or 7 arguments in this group.
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
    From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
    enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
               SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);

// Declarations: Vector casts without offset {{{1
// AVX2::Vector {{{2
// Each line declares one simd_cast overload that converts N vectors of the first
// AVX2 type (N is the macro suffix) into one vector of the second AVX2 type.

// float_v <-> double_v: declared unconditionally.
Vc_SIMD_CAST_AVX_1( float_v, double_v);

Vc_SIMD_CAST_AVX_1(double_v,  float_v);
Vc_SIMD_CAST_AVX_2(double_v,  float_v);

// Casts involving the integer vector types are only declared if Vc_IMPL_AVX2 is
// defined. The groups below are ordered by destination type.
#ifdef Vc_IMPL_AVX2
// -> double_v
Vc_SIMD_CAST_AVX_1(   int_v, double_v);
Vc_SIMD_CAST_AVX_1(  uint_v, double_v);
Vc_SIMD_CAST_AVX_1( short_v, double_v);
Vc_SIMD_CAST_AVX_1(ushort_v, double_v);

// -> float_v
Vc_SIMD_CAST_AVX_1(   int_v,  float_v);
Vc_SIMD_CAST_AVX_1(  uint_v,  float_v);
Vc_SIMD_CAST_AVX_1( short_v,  float_v);
Vc_SIMD_CAST_AVX_1(ushort_v,  float_v);

// -> int_v
Vc_SIMD_CAST_AVX_1(double_v,    int_v);
Vc_SIMD_CAST_AVX_1( float_v,    int_v);
Vc_SIMD_CAST_AVX_1(  uint_v,    int_v);
Vc_SIMD_CAST_AVX_1( short_v,    int_v);
Vc_SIMD_CAST_AVX_1(ushort_v,    int_v);
Vc_SIMD_CAST_AVX_2(double_v,    int_v);

// -> uint_v
Vc_SIMD_CAST_AVX_1(double_v,   uint_v);
Vc_SIMD_CAST_AVX_1( float_v,   uint_v);
Vc_SIMD_CAST_AVX_1(   int_v,   uint_v);
Vc_SIMD_CAST_AVX_1( short_v,   uint_v);
Vc_SIMD_CAST_AVX_1(ushort_v,   uint_v);
Vc_SIMD_CAST_AVX_2(double_v,   uint_v);

// -> short_v
Vc_SIMD_CAST_AVX_1(double_v,  short_v);
Vc_SIMD_CAST_AVX_1( float_v,  short_v);
Vc_SIMD_CAST_AVX_1(   int_v,  short_v);
Vc_SIMD_CAST_AVX_1(  uint_v,  short_v);
Vc_SIMD_CAST_AVX_1(ushort_v,  short_v);
Vc_SIMD_CAST_AVX_2(double_v,  short_v);
Vc_SIMD_CAST_AVX_2( float_v,  short_v);
Vc_SIMD_CAST_AVX_2(   int_v,  short_v);
Vc_SIMD_CAST_AVX_2(  uint_v,  short_v);
Vc_SIMD_CAST_AVX_3(double_v,  short_v);
Vc_SIMD_CAST_AVX_4(double_v,  short_v);

// -> ushort_v
Vc_SIMD_CAST_AVX_1(double_v, ushort_v);
Vc_SIMD_CAST_AVX_1( float_v, ushort_v);
Vc_SIMD_CAST_AVX_1(   int_v, ushort_v);
Vc_SIMD_CAST_AVX_1(  uint_v, ushort_v);
Vc_SIMD_CAST_AVX_1( short_v, ushort_v);
Vc_SIMD_CAST_AVX_2(double_v, ushort_v);
Vc_SIMD_CAST_AVX_2( float_v, ushort_v);
Vc_SIMD_CAST_AVX_2(   int_v, ushort_v);
Vc_SIMD_CAST_AVX_2(  uint_v, ushort_v);
Vc_SIMD_CAST_AVX_3(double_v, ushort_v);
Vc_SIMD_CAST_AVX_4(double_v, ushort_v);
#endif

// 1 SSE::Vector to 1 AVX2::Vector {{{2
// Each line declares simd_cast<To>(From) for one SSE source type and one AVX2
// destination type.

// -> AVX2::double_v
Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v);

// -> AVX2::float_v
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::   int_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v);

// Integer AVX2 destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
// from SSE::double_v
Vc_SIMD_CAST_1(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v);

// from SSE::float_v
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v);

// SSE integer types -> AVX2::int_v
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::   int_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::   int_v);

// SSE integer types -> AVX2::uint_v
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::  uint_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::  uint_v);

// SSE integer types -> AVX2::short_v
Vc_SIMD_CAST_1(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v);

// SSE integer types -> AVX2::ushort_v
Vc_SIMD_CAST_1(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE::  uint_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v);
#endif

// 2 SSE::Vector to 1 AVX2::Vector {{{2
// Each line declares simd_cast<To>(From, From): two SSE vectors of the same type are
// converted into one AVX2 vector.
Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v);

Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE::   int_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2:: float_v);

// Integer AVX2 destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v);

Vc_SIMD_CAST_2(SSE:: float_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v);

Vc_SIMD_CAST_2(SSE::   int_v, AVX2::   int_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::   int_v);

Vc_SIMD_CAST_2(SSE::   int_v, AVX2::  uint_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::  uint_v);

Vc_SIMD_CAST_2(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v);

Vc_SIMD_CAST_2(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE::  uint_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v);
#endif

// 3 SSE::Vector to 1 AVX2::Vector {{{2
// Each line declares simd_cast<To>(From, From, From): three SSE vectors of the same
// type are converted into one AVX2 vector.
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v);

// Integer AVX2 destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_3(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v);

Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v);

Vc_SIMD_CAST_3(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE::  uint_v, AVX2:: short_v);

Vc_SIMD_CAST_3(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE::  uint_v, AVX2::ushort_v);
#endif

// 4 SSE::Vector to 1 AVX2::Vector {{{2
// Each line declares simd_cast<To> taking four SSE vectors of the same type and
// returning one AVX2 vector.
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v);

// Integer AVX2 destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_v, AVX2::   int_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2::  uint_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v);

Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v);

Vc_SIMD_CAST_4(SSE::   int_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE::  uint_v, AVX2:: short_v);

Vc_SIMD_CAST_4(SSE::   int_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE::  uint_v, AVX2::ushort_v);
#endif

// The following four groups declare simd_cast overloads taking 5, 6, 7 or 8
// SSE::double_v arguments. They exist only for the destination types
// AVX2::short_v and AVX2::ushort_v and only if Vc_IMPL_AVX2 is defined.

// 5 SSE::Vector to 1 AVX2::Vector {{{2
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v);
#endif

// 6 SSE::Vector to 1 AVX2::Vector {{{2
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v);
#endif

// 7 SSE::Vector to 1 AVX2::Vector {{{2
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v);
#endif

// 8 SSE::Vector to 1 AVX2::Vector {{{2
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v);
#endif

// 1 AVX2::Vector to 1 SSE::Vector {{{2
// Each line declares simd_cast<To>(From) for one AVX2 source type and one SSE
// destination type. The groups are ordered by source type.

// from AVX2::double_v
Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v);

// from AVX2::float_v
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v);

// Integer AVX2 source types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
// from AVX2::int_v
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::   int_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2::   int_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::   int_v, SSE::ushort_v);

// from AVX2::uint_v
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::  uint_v, SSE::ushort_v);

// from AVX2::short_v
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v);

// from AVX2::ushort_v
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::   int_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::  uint_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v);
#endif

// 2 AVX2::Vector to 1 SSE::Vector {{{2
// Two AVX2::double_v arguments converted into one SSE::short_v or SSE::ushort_v.
// These are the only two-argument AVX2 -> SSE casts declared here.
Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v);
Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v);

// 1 Scalar::Vector to 1 AVX2::Vector {{{2
// One declaration per AVX2 destination type. Each takes a single Scalar::Vector<T>
// (for any T) and is enabled only if Return is exactly the named AVX2 type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// Integer destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 2 Scalar::Vector to 1 AVX2::Vector {{{2
// One declaration per AVX2 destination type. Each takes two Scalar::Vector<T>
// arguments and is enabled only if Return is exactly the named AVX2 type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// Integer destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 3 Scalar::Vector to 1 AVX2::Vector {{{2
// One declaration per AVX2 destination type. Each takes three Scalar::Vector<T>
// arguments and is enabled only if Return is exactly the named AVX2 type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// Integer destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 4 Scalar::Vector to 1 AVX2::Vector {{{2
// One declaration per AVX2 destination type. Each takes four Scalar::Vector<T>
// arguments and is enabled only if Return is exactly the named AVX2 type.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// Integer destination types: only declared if Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 5 Scalar::Vector to 1 AVX2::Vector {{{2
// Declarations only; the bodies are not part of this section. Every overload
// takes five Scalar::Vector<T> arguments by value and returns `Return`.
// The trailing unnamed `enable_if<...>` parameter defaults to `nullarg` (callers
// can omit it); its std::is_same condition restricts each overload to exactly
// one AVX2 return type.
// No AVX2::double_v overload is declared for five arguments.
// NOTE(review): presumably because double_v has fewer than five entries -
// confirm against the AVX2::double_v definition.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// The int_v, uint_v, short_v and ushort_v overloads are declared only when
// Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 6 Scalar::Vector to 1 AVX2::Vector {{{2
// Declarations only; the bodies are not part of this section. Every overload
// takes six Scalar::Vector<T> arguments by value and returns `Return`.
// The trailing unnamed `enable_if<...>` parameter defaults to `nullarg` (callers
// can omit it); its std::is_same condition restricts each overload to exactly
// one AVX2 return type.
// No AVX2::double_v overload is declared for six arguments.
// NOTE(review): presumably because double_v has fewer than six entries -
// confirm against the AVX2::double_v definition.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// The int_v, uint_v, short_v and ushort_v overloads are declared only when
// Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 7 Scalar::Vector to 1 AVX2::Vector {{{2
// Declarations only; the bodies are not part of this section. Every overload
// takes seven Scalar::Vector<T> arguments by value and returns `Return`.
// The trailing unnamed `enable_if<...>` parameter defaults to `nullarg` (callers
// can omit it); its std::is_same condition restricts each overload to exactly
// one AVX2 return type.
// No AVX2::double_v overload is declared for seven arguments.
// NOTE(review): presumably because double_v has fewer than seven entries -
// confirm against the AVX2::double_v definition.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
// The int_v, uint_v, short_v and ushort_v overloads are declared only when
// Vc_IMPL_AVX2 is defined.
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6,
          enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif

// 8 Scalar::Vector to 1 AVX2::Vector {{{2
// Declaration only: simd_cast from eight Scalar::Vector<T> arguments (passed by
// value) to `Return`. The trailing unnamed `enable_if<...>` parameter defaults
// to `nullarg` (callers can omit it) and restricts this overload to
// Return == AVX2::float_v.
// NOTE(review): the mapping of x0..x7 onto the entries of the result is decided
// by the definition - confirm there.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6, Scalar::Vector<T> x7,
          enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
// Declaration only: simd_cast from eight Scalar::Vector<T> arguments (passed by
// value) to `Return`, restricted via the defaulted `enable_if<...>` parameter to
// Return == AVX2::int_v. Sits inside the enclosing `#ifdef Vc_IMPL_AVX2`, so it
// is declared only when that macro is defined.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6, Scalar::Vector<T> x7,
          enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
// Declaration only: simd_cast from eight Scalar::Vector<T> arguments (passed by
// value) to `Return`, restricted via the defaulted `enable_if<...>` parameter to
// Return == AVX2::uint_v. Sits inside the enclosing `#ifdef Vc_IMPL_AVX2`, so it
// is declared only when that macro is defined.
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
          Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
          Scalar::Vector<T> x6, Scalar::Vector<T> x7,
          enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typenam